diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78d5e0eb6..ead63bff8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4)
 project(OpenBLAS)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 2)
-set(OpenBLAS_PATCH_VERSION 17)
+set(OpenBLAS_PATCH_VERSION 18)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 enable_language(ASM)
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index da56c0758..ebe52ea8a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -147,5 +147,6 @@ In chronological order:
   * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
   * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
 
-* [Your name or handle] <[email or website]>
-  * [Date] [Brief summary of your changes]
+* theoractice
+  * [2016-03-20] Fix compiler error in VisualStudio with CMake
+  * [2016-03-22] Fix access violation on Windows while static linking
diff --git a/Changelog.txt b/Changelog.txt
index c59166c38..7f82e8e88 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,22 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.2.18
+12-Apr-2016
+common:
+  * If the MAKE_NB_JOBS flag is set to a value less than or equal to zero,
+    make will be run without the -j option.
+
+x86/x86_64:
+  * Support building a Visual Studio static library. (#813, Thanks, theoractice)
+  * Fix bugs to pass the buildbot CI tests (http://build.openblas.net)
+
+ARM:
+  * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K)
+
+POWER:
+  * Optimize S and C BLAS3 on Power8
+  * Optimize BLAS2/1 on Power8
+
 ====================================================================
 Version 0.2.17
 20-Mar-2016
diff --git a/Makefile.rule b/Makefile.rule
index 0758a48a8..d8db6102c 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@ #
 #
 
 # This library's version
-VERSION = 0.2.17
+VERSION = 0.2.18
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -112,7 +112,10 @@ NO_AFFINITY = 1
 # NO_PARALLEL_MAKE = 1
 
 # Force number of make jobs. The default is the number of logical CPU of the host.
-# This is particularly useful when using distcc
+# This is particularly useful when using distcc.
+# A negative value disables adding a -j flag to make, allowing a parent
+# make -j value to be used. This is useful when calling the OpenBLAS make
+# from another project's makefile.
 # MAKE_NB_JOBS = 2
 
 # If you would like to know minute performance report of GotoBLAS.
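
The MAKE_NB_JOBS behaviour described in the Makefile.rule comment and the 0.2.18 changelog entry works as follows: only a positive value forces an explicit -j N; zero or a negative value emits no -j at all, so a parent "make -j" (for example from an enclosing project's makefile) keeps control of the job count. The guard itself is the #if MAKE_NB_JOBS > 0 block visible in the getarch.c hunk further down. A minimal stand-alone C sketch of that decision, with the macro defaulted here purely for illustration:

    /* Sketch only -- the real logic lives in getarch.c, which prints
     * Makefile fragments while OpenBLAS configures itself. */
    #include <stdio.h>

    #ifndef MAKE_NB_JOBS
    #define MAKE_NB_JOBS 0          /* assumed value for this illustration */
    #endif

    int main(void) {
    #if MAKE_NB_JOBS > 0
        /* Positive: force an explicit job count. */
        printf("MAKE += -j %d\n", MAKE_NB_JOBS);
    #else
        /* Zero or negative: print nothing, so make runs without -j and any
         * -j passed by a parent make invocation stays in effect. */
    #endif
        return 0;
    }
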
diff --git a/appveyor.yml b/appveyor.yml index 172a49b42..5360a9ef9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.15.{build} +version: 0.2.18.{build} #environment: diff --git a/benchmark/Makefile b/benchmark/Makefile index 11d3c5bec..8166f3863 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread # Apple vecLib LIBVECLIB = -framework Accelerate +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ endif - +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX) slinpack.veclib : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) @@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) @@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) 
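
The srot.goto and drot.goto entries added to the benchmark target lists above are both built from the single new benchmark/rot.c that appears further down, selected at compile time by -UDOUBLE or -DDOUBLE (see the srot.$(SUFFIX) and drot.$(SUFFIX) rules later in this Makefile). A small self-contained C sketch of that precision-selection pattern; FLOAT and the routine name are stand-ins here, not the real definitions from common.h:

    /* Illustrative stub: one source, two precisions, chosen by -DDOUBLE. */
    #include <stdio.h>

    #ifdef DOUBLE
    typedef double FLOAT;
    #define ROT_NAME "drot"         /* built as: cc -UCOMPLEX -DDOUBLE ... */
    #else
    typedef float  FLOAT;
    #define ROT_NAME "srot"         /* built as: cc -UCOMPLEX -UDOUBLE ... */
    #endif

    int main(void) {
        FLOAT c = 2.0, s = 2.0;     /* the constants rot.c passes to ?rot */
        printf("%s stub: sizeof(FLOAT)=%zu, c=%g, s=%g\n",
               ROT_NAME, sizeof(FLOAT), (double)c, (double)s);
        return 0;
    }
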
+zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) @@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX) sgemm.veclib : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemm #################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX) dgemm.veclib : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) @@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX) cgemm.veclib : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) @@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX) zgemm.veclib : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX) strmm.veclib : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX) dtrmm.veclib : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) @@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX) ctrmm.veclib : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) @@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX) ztrmm.veclib : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX) zdot.veclib : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c zgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + @@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl include $(TOPDIR)/Makefile.tail diff --git a/benchmark/rot.c b/benchmark/rot.c new file mode 100644 index 000000000..32322bebb --- /dev/null +++ b/benchmark/rot.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", 
(int)m); + + + for (l=0; l 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); + #else + // Let make use parent -j argument or -j1 if there + // is no make parent + #endif #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else diff --git a/getarch_2nd.c b/getarch_2nd.c index fad647fed..cf9c578cb 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -64,10 +64,13 @@ int main(int argc, char **argv) { if ((argc >= 2) && (*argv[1] == '1')) { + +#if defined(ARCH_X86) || defined(ARCH_X86_64) printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); +#endif #ifdef USE64BITINT printf("#define USE64BITINT\n"); diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7a70264ca..7f2ddea07 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, 
v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] ld2 {v4.4s, v5.4s} , [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] prfm PLDL1KEEP, [pA, #512] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v6.4s, v7.4s} , [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [ppA, #512] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii 
v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] ld2 {v0.4s, v1.4s}, [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [ppA, #512] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v2.4s, v3.4s}, [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii 
v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_SUB @@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + 
OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro SAVE8x4 @@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S old mode 100755 new mode 100644 index 40b98cee2..d58cef52d --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || 
\ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index be0e9bdef..3de27257a 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S old mode 100755 new mode 100644 index 3131541d4..ce5cb0406 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || 
\ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index e2ad11492..44b0f7ff2 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v11.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmul v20.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + fmul v20.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmul v22.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v22.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v10.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmul v24.2d, v0.2d, v10.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] // for next round add pA, pA, #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v23.2d, v3.2d, v9.d[0] ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 - fmul v28.2d, v0.2d, v11.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v11.d[0] + fmul v17.2d, v1.2d, v8.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v30.2d, v2.2d, v11.d[0] + fmul v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v15.d[0] ldp d8, d9, [pB] add pB, pB, #16 - fmla v18.2d, v6.2d, v12.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v31.2d, v7.2d, v15.d[0] ldp d10, d11, [pB] add pB, pB, #16 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] ldp q0, q1, [pA] add pA, pA, #32 - fmla v26.2d, v6.2d, v14.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmla 
v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] add pA, pA, #32 - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] ldp q6, q7, [ppA] add ppA, ppA, #32 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v27.2d, v7.2d, v14.d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v31.2d, v7.2d, v15.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v23.2d, v7.2d, v13.d[0] .endm .macro KERNEL8x4_SUB @@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x4 @@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S old mode 100755 new mode 100644 index 88e9a773d..b04dbb5d5 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S old mode 100755 new mode 100644 index a607fecc4..f3c3d5c35 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + ldp d10, d11, [pB], #16 - fmul v22.2d, v2.2d, v9.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + ldp q2, q3, [pA], #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + ldp q4, q5, [pA], #32 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - ldp d12, d13, [pB] - add pB, pB, #16 - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d12, d13, [pB], #16 + + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] + + ldp d14, d15, [pB], #16 + + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] + + ldp q6, q7, [pA], #32 + + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - ld1 {v4.2d}, [pA], #16 + ldp q4, q5, [pA], #32 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] - ld1 {v5.2d}, [pA], #16 + ldp d12, d13, [pB], #16 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] - ldp d12, d13, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d14, d15, [pB], #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] - ld1 {v6.2d}, [pA], #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] - ld1 {v7.2d}, [pA], #16 + ldp q6, q7, [pA], #32 - fmla v22.2d, v2.2d, v9.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - - prfm PLDL1KEEP, [pA, #224] - prfm PLDL1KEEP, [pA, #224+64] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] - ld1 {v0.2d}, [pA], #16 + ldp q0, q1, [pA], #32 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] - ld1 {v1.2d}, [pA], #16 + 
ldp d8, d9, [pB], #16 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] - ldp d8, d9, [pB] - add pB, pB, #16 + ldp d10, d11, [pB], #16 - fmla v28.2d, v4.2d, v15.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] - ldp d10, d11, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - ld1 {v2.2d}, [pA], #16 + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + ldp q2, q3, [pA], #32 - ld1 {v3.2d}, [pA], #16 - - fmla v18.2d, v6.2d, v12.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - - prfm PLDL1KEEP, [pB, #640] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + ldp d8, d9, [pB], #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + ldp d10, d11, [pB], #16 - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] + + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 fmov alpha0, alpha - ld1 {v0.2d, v1.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 
fmla v1.2d, v17.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld1 {v2.2d, v3.2d}, [pCRow0] + ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow0] + stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 - ld1 {v4.2d, v5.2d}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow1] + stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 - ld1 {v0.2d, v1.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow2] + stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v2.2d, v3.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow2] + stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v4.2d, v5.2d}, [pCRow3] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow3] + stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow3] + ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow3] + stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 - - prfm PLDL2KEEP, [pCRow0, #128] - prfm PLDL2KEEP, [pCRow1, #128] - prfm PLDL2KEEP, [pCRow2, #128] - prfm PLDL2KEEP, [pCRow3, #128] .endm /******************************************************************************/ @@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 - fmla v9.2d, v25.2d, alphaV1 + fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV2 - fmla v13.2d, v29.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] - fmla v8.2d, v24.2d, alphaV2 + fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 - fmla v5.2d, v21.2d, alphaV1 - fmla v6.2d, v22.2d, alphaV2 - fmla v7.2d, v23.2d, alphaV3 + fmla v5.2d, v21.2d, alphaV0 + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1 , pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 + fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN + .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a - .align 5 + .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 - + .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: b dgemm_kernel_L4_M8_44 + .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble dgemm_kernel_L4_M8_100 + .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 0d1b12881..34fb8c233 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S old mode 100755 new mode 100644 index eb7397faa..4aecf28eb --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S old mode 100755 new mode 100644 index 6890505bd..b06c7560d --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v21.2d, v1.2d, v8.2d[1] - fmul v22.2d, v2.2d, v8.2d[1] - fmul v23.2d, v3.2d, v8.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + fmul v21.2d, v1.2d, v8.d[1] + fmul v22.2d, v2.2d, v8.d[1] + fmul v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v9.2d[0] - fmul v26.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v9.2d[0] + fmul v24.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v9.d[0] + fmul v26.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v9.d[0] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v29.2d, v1.2d, v9.2d[1] - fmul v30.2d, v2.2d, v9.2d[1] - fmul v31.2d, v3.2d, v9.2d[1] + fmul v28.2d, v0.2d, v9.d[1] + fmul v29.2d, v1.2d, v9.d[1] + fmul v30.2d, v2.2d, v9.d[1] + fmul v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] .endm .macro SAVE8x4 @@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 @@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 @@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 22b55b01c..68366d9f2 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index bfa80d589..a5cf7baff 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmul v16.4s, v0.4s, v8.4s[0] - fmul v20.4s, v0.4s, v8.4s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmul v24.4s, v0.4s, v8.4s[2] - fmul v28.4s, v0.4s, v8.4s[3] + fmul v24.4s, v0.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmul v17.4s, v2.4s, v8.4s[0] - fmul v21.4s, v2.4s, v8.4s[1] + fmul v17.4s, v2.4s, v8.s[0] + fmul v21.4s, v2.4s, v8.s[1] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmul v25.4s, v2.4s, v8.4s[2] - fmul v29.4s, v2.4s, v8.4s[3] + fmul v25.4s, v2.4s, v8.s[2] + fmul v29.4s, v2.4s, v8.s[3] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmul v18.4s, v4.4s, v8.4s[0] - fmul v19.4s, v6.4s, v8.4s[0] + fmul v18.4s, v4.4s, v8.s[0] + fmul v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmul v22.4s, v4.4s, v8.4s[1] - fmul v23.4s, v6.4s, v8.4s[1] + fmul v22.4s, v4.4s, v8.s[1] + fmul v23.4s, v6.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmul v26.4s, v4.4s, v8.4s[2] - fmul v27.4s, v6.4s, v8.4s[2] + fmul v26.4s, v4.4s, v8.s[2] + fmul v27.4s, v6.4s, v8.s[2] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmul v30.4s, v4.4s, v8.4s[3] - fmul v31.4s, v6.4s, v8.4s[3] + fmul v30.4s, v4.4s, v8.s[3] + fmul v31.4s, v6.4s, v8.s[3] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 .endm .macro KERNEL16x4_M2 - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] ld1 {v8.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] ld1 {v0.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] ld1 {v2.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] ld1 {v4.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] ld1 {v6.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] prfm PLDL1KEEP, [pA_2, #512] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] prfm PLDL1KEEP, [pA_3, #512] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v17.4s, v2.4s, v8.4s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v2.4s, v8.s[0] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v19.4s, v6.4s, v8.4s[0] + fmla v18.4s, v4.4s, v8.s[0] + fmla v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v0.4s, v8.4s[1] - fmla v21.4s, v2.4s, v8.4s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v2.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v4.4s, v8.4s[1] - fmla v23.4s, v6.4s, v8.4s[1] + fmla v22.4s, v4.4s, v8.s[1] + fmla v23.4s, v6.4s, v8.s[1] ld1 {v5.4s}, [pA_2] // for next round add 
pA_2, pA_2, #16 - fmla v24.4s, v0.4s, v8.4s[2] - fmla v25.4s, v2.4s, v8.4s[2] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v2.4s, v8.s[2] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v4.4s, v8.4s[2] - fmla v27.4s, v6.4s, v8.4s[2] + fmla v26.4s, v4.4s, v8.s[2] + fmla v27.4s, v6.4s, v8.s[2] prfm PLDL1KEEP, [pA_0, #512] - fmla v28.4s, v0.4s, v8.4s[3] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA_1, #512] - fmla v30.4s, v4.4s, v8.4s[3] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v30.4s, v4.4s, v8.s[3] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro KERNEL16x4_E - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB @@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v20.4s, v0.4s, v8.4s[1] - fmla v24.4s, v0.4s, v8.4s[2] - fmla v28.4s, v0.4s, v8.4s[3] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmla v17.4s, v2.4s, v8.4s[0] - fmla v21.4s, v2.4s, v8.4s[1] - fmla v25.4s, v2.4s, v8.4s[2] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v17.4s, v2.4s, v8.s[0] + fmla v21.4s, v2.4s, v8.s[1] + fmla v25.4s, v2.4s, v8.s[2] + fmla v29.4s, v2.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v22.4s, v4.4s, v8.4s[1] - fmla v26.4s, v4.4s, v8.4s[2] - fmla v30.4s, v4.4s, v8.4s[3] + fmla v18.4s, v4.4s, v8.s[0] + fmla v22.4s, v4.4s, v8.s[1] + fmla v26.4s, v4.4s, v8.s[2] + fmla v30.4s, v4.4s, v8.s[3] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmla v19.4s, v6.4s, v8.4s[0] - fmla v23.4s, v6.4s, v8.4s[1] - fmla v27.4s, v6.4s, v8.4s[2] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v19.4s, v6.4s, v8.s[0] + fmla v23.4s, v6.4s, v8.s[1] + fmla v27.4s, v6.4s, v8.s[2] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro SAVE16x4 @@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v2.2s, v3.2s}, [pA_1] add pA_1, pA_1, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] - fmla v18.2s, v2.2s, v8.2s[0] - fmla v31.2s, v3.2s, v9.2s[1] - fmla v22.2s, v2.2s, v8.2s[1] - fmla v27.2s, v3.2s, v9.2s[0] + fmla v18.2s, v2.2s, v8.s[0] + fmla v31.2s, v3.2s, v9.s[1] + fmla v22.2s, v2.2s, v8.s[1] + fmla v27.2s, v3.2s, v9.s[0] - fmla v26.2s, v2.2s, v9.2s[0] - fmla v23.2s, v3.2s, v8.2s[1] - fmla v30.2s, v2.2s, v9.2s[1] - fmla v19.2s, v3.2s, v8.2s[0] + fmla v26.2s, v2.2s, v9.s[0] + fmla v23.2s, v3.2s, v8.s[1] + fmla v30.2s, v2.2s, v9.s[1] + fmla v19.2s, v3.2s, v8.s[0] .endm .macro SAVE8x4 @@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA_0] add pA_0, pA_0, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0 , pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA_0] add pA_0 , pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index ac690e4d4..bd47bed31 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S old mode 100755 new mode 100644 index b99760a03..28b321651 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index 674e200d8..eeb3e6e72 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S old mode 100755 new mode 100644 index 98b912934..843f0c890 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 28ce3de40..1cb695e56 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) 
|| defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, 
v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, 
v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 3ff8227e3..7945870d6 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ 
defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + 
OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 760d568cd..b37a4213b 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,18 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = gemm_kernel_power6.S +STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #ISMINKERNEL = ../arm/imin.c #IDMINKERNEL = ../arm/imin.c # -#SASUMKERNEL = ../arm/asum.c -#DASUMKERNEL = ../arm/asum.c -#CASUMKERNEL = ../arm/zasum.c -#ZASUMKERNEL = ../arm/zasum.c +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c # #SAXPYKERNEL = ../arm/axpy.c -#DAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c -#ZAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = zaxpy.c # -#SCOPYKERNEL = ../arm/copy.c -#DCOPYKERNEL = ../arm/copy.c -#CCOPYKERNEL = ../arm/zcopy.c -#ZCOPYKERNEL = ../arm/zcopy.c +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c # -#SDOTKERNEL = ../arm/dot.c -#DDOTKERNEL = ../arm/dot.c +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c -#ZDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = zdot.c # #SNRM2KERNEL = ../arm/nrm2.c #DNRM2KERNEL = ../arm/nrm2.c #CNRM2KERNEL = ../arm/znrm2.c #ZNRM2KERNEL = ../arm/znrm2.c # -#SROTKERNEL = ../arm/rot.c -#DROTKERNEL = ../arm/rot.c +SROTKERNEL = srot.c +DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c # -#SSCALKERNEL = ../arm/scal.c -#DSCALKERNEL = ../arm/scal.c +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c -#ZSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = zscal.c # -#SSWAPKERNEL = ../arm/swap.c -#DSWAPKERNEL = ../arm/swap.c -#CSWAPKERNEL = 
../arm/zswap.c -#ZSWAPKERNEL = ../arm/zswap.c +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c # #SGEMVNKERNEL = ../arm/gemv_n.c -#DGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = dgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c #ZGEMVNKERNEL = ../arm/zgemv_n.c # #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = ../arm/zgemv_t.c +#ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c new file mode 100644 index 000000000..aeed0ca78 --- /dev/null +++ b/kernel/power/casum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "casum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + casum_kernel_16(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + ip = 2 * n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += 2; + i++; + } + + } + else + { + inc_x2 = 2 * inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c new file mode 100644 index 000000000..cb50234ce --- /dev/null +++ b/kernel/power/casum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
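The casum_microk_power8.c kernel that follows keeps eight VSX accumulators of absolute values (vs32-vs39), folds them with a short xvaddsp reduction tree, and stores one vector of four partial sums that the C wrapper then adds together; the scalar fallback in casum.c does the same with four float accumulators. The split-accumulator idea in portable C looks roughly like this (a sketch only, assuming n counts complex elements and is a multiple of 4, as the n & -16 blocking in the wrapper guarantees):

#include <math.h>

/* Four running sums shorten the dependency chain and mirror the
 * svec[0..3] partial sums produced by the microkernel. Illustrative only. */
static void casum_partial(long n, const float *x, float svec[4])
{
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    for (long i = 0; i < n; i += 2) {   /* 2 complex = 4 floats per step */
        s0 += fabsf(x[0]);
        s1 += fabsf(x[1]);
        s2 += fabsf(x[2]);
        s3 += fabsf(x[3]);
        x += 4;
    }
    svec[0] = s0; svec[1] = s1; svec[2] = s2; svec[3] = s3;
}

The caller then sums svec[0..3] into the final scalar result, exactly as the wrapper does after the assembly kernel returns.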
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -16 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c new file mode 100644 index 000000000..ce7d67475 --- /dev/null +++ b/kernel/power/ccopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
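The new kernel/power/ccopy.c routes the single-precision complex copy (y := x) to a POWER8 microkernel: for unit strides it hands the largest block of complex elements to ccopy_kernel_32 and finishes the remainder with a scalar loop, while any other stride falls back to a plain strided loop. A compact sketch of that unit-stride structure (ccopy_sketch and kernel32 are illustrative stand-ins, and the 32-element blocking is inferred from the kernel name):

#include <stddef.h>

/* Sketch of the unit-stride path: a vector kernel handles multiples of
 * 32 complex elements, a scalar loop copies the tail. */
static void ccopy_sketch(size_t n, const float *x, float *y,
                         void (*kernel32)(size_t, const float *, float *))
{
    size_t n1 = n & ~(size_t)31;        /* largest multiple of 32 <= n */
    if (n1 > 0)
        kernel32(n1, x, y);             /* copies 2*n1 floats */
    for (size_t i = n1; i < n; i++) {   /* remaining complex elements */
        y[2 * i]     = x[2 * i];
        y[2 * i + 1] = x[2 * i + 1];
    }
}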
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "ccopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + ccopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c new file mode 100644 index 000000000..95b3559ba --- /dev/null +++ b/kernel/power/ccopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
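When no microkernel defines HAVE_KERNEL_32, ccopy.c above falls back to a generic C ccopy_kernel_32 whose only job is to copy n complex elements (2*n floats) from x to y. A minimal sketch of such a fallback, assuming, as the caller guarantees, that n is a positive multiple of 32 (illustrative; not the code from the patch):

#include <stddef.h>

/* Unrolled copy of 2*n floats (n complex values), 4 complex per step. */
static void ccopy_kernel_32_sketch(size_t n, const float *x, float *y)
{
    for (size_t i = 0; i < 2 * n; i += 8) {
        y[i]     = x[i];     y[i + 1] = x[i + 1];
        y[i + 2] = x[i + 2]; y[i + 3] = x[i + 3];
        y[i + 4] = x[i + 4]; y[i + 5] = x[i + 5];
        y[i + 6] = x[i + 6]; y[i + 7] = x[i + 7];
    }
}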
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S new file mode 100644 index 000000000..0c462ce8e --- /dev/null +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -0,0 +1,407 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 32000 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 + +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 +#define L r15 +#define o12 r16 +#define o4 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 
240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) +#else + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) +#else + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + +#ifdef __64BIT__ + addi T1 , SP, 296 +#else + addi T1 , SP, 224 +#endif + + stxsspx vs1, 0, T1 + lxsspx alpha_dr, 0, T1 + stxsspx vs2, o8 , T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 + + .align 5 + +#include "cgemm_logic_8x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S new file mode 100644 index 000000000..db2a57f91 --- /dev/null +++ 
b/kernel/power/cgemm_logic_8x4_power8.S @@ -0,0 +1,1459 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 2 + ble CGEMM_L4_END + +CGEMM_L4_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +CGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L4_COPYB + + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 3 + ble CGEMM_L4x8_END + +CGEMM_L4x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x8_SUB4 + +CGEMM_L4x8_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD4x8_1 + dcbt BO, PRE + KERNEL4x8_I1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + addic. 
L, L, -2 + ble CGEMM_L4x8_LOOP_END + + .align 5 + +CGEMM_L4x8_LOOP: + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt CGEMM_L4x8_LOOP + +CGEMM_L4x8_LOOP_END: + + dcbt BO, PRE + KERNEL4x8_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b CGEMM_L4x8_SUB1 + +CGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b CGEMM_L4x8_SUB1 + +CGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + +CGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x8_SAVE + +CGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x8_SUB2 + +CGEMM_L4x8_SAVE: + + SAVE4x8 + + addic. I, I, -1 + bgt CGEMM_L4x8_BEGIN + +CGEMM_L4x8_END: + +CGEMM_L4x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L4x1_END + + andi. T1, M, 4 + ble CGEMM_L4x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x4_SUB4 + +CGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble CGEMM_L4x4_LOOP_END + + .align 5 + +CGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt CGEMM_L4x4_LOOP + +CGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b CGEMM_L4x4_SUB1 + +CGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b CGEMM_L4x4_SUB1 + +CGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + +CGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x4_SAVE + +CGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x4_SUB2 + +CGEMM_L4x4_SAVE: + + SAVE4x4 + +CGEMM_L4x4_END: + +CGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L4x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x2_SUB4 + +CGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble CGEMM_L4x2_LOOP_END + + .align 5 + +CGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt CGEMM_L4x2_LOOP + +CGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b CGEMM_L4x2_SUB1 + +CGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b CGEMM_L4x2_SUB1 + +CGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + +CGEMM_L4x2_SUB1: + + andi. 
L, K, 7 + ble CGEMM_L4x2_SAVE + +CGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x2_SUB2 + +CGEMM_L4x2_SAVE: + + SAVE4x2 + +CGEMM_L4x2_END: + +CGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L4x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L4x1_SUB4 + +CGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble CGEMM_L4x1_LOOP_END + + .align 5 + +CGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt CGEMM_L4x1_LOOP + +CGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b CGEMM_L4x1_SUB1 + +CGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b CGEMM_L4x1_SUB1 + +CGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + +CGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L4x1_SAVE + +CGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L4x1_SUB2 + +CGEMM_L4x1_SAVE: + + SAVE4x1 + +CGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt CGEMM_L4_BEGIN + + andi. T2, N, 3 + ble L999_H2 + +CGEMM_L4_END: + + b CGEMM_L2_BEGIN + +L999_H1: + + b L999_H2 + +CGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +CGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L2_COPYB + + + andi. T1, N, 2 + ble CGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble CGEMM_L2x8_END + +CGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x8_SUB4 + +CGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble CGEMM_L2x8_LOOP_END + + .align 5 + +CGEMM_L2x8_LOOP: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt CGEMM_L2x8_LOOP + +CGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b CGEMM_L2x8_SUB1 + +CGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b CGEMM_L2x8_SUB1 + +CGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + +CGEMM_L2x8_SUB1: + + andi. 
L, K, 7 + ble CGEMM_L2x8_SAVE + +CGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x8_SUB2 + +CGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt CGEMM_L2x8_BEGIN + +CGEMM_L2x8_END: + +CGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L2x1_END + + andi. T1, M, 4 + ble CGEMM_L2x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x4_SUB4 + +CGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble CGEMM_L2x4_LOOP_END + + .align 5 + +CGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt CGEMM_L2x4_LOOP + +CGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b CGEMM_L2x4_SUB1 + +CGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b CGEMM_L2x4_SUB1 + +CGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + +CGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x4_SAVE + +CGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x4_SUB2 + +CGEMM_L2x4_SAVE: + + SAVE2x4 + +CGEMM_L2x4_END: + +CGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L2x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x2_SUB4 + +CGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble CGEMM_L2x2_LOOP_END + + .align 5 + +CGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt CGEMM_L2x2_LOOP + +CGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b CGEMM_L2x2_SUB1 + +CGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b CGEMM_L2x2_SUB1 + +CGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + +CGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x2_SAVE + +CGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x2_SUB2 + +CGEMM_L2x2_SAVE: + + SAVE2x2 + +CGEMM_L2x2_END: + +CGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L2x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L2x1_SUB4 + +CGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble CGEMM_L2x1_LOOP_END + + .align 5 + +CGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt CGEMM_L2x1_LOOP + +CGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b CGEMM_L2x1_SUB1 + +CGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b CGEMM_L2x1_SUB1 + +CGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + +CGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L2x1_SAVE + +CGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L2x1_SUB2 + +CGEMM_L2x1_SAVE: + + SAVE2x1 + +CGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +CGEMM_L2_END: + + b CGEMM_L1_BEGIN + +L999_H2: + + b L999 + +CGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +CGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L1_COPYB + + + andi. T1, N, 1 + ble CGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble CGEMM_L1x8_END + +CGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x8_SUB4 + +CGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble CGEMM_L1x8_LOOP_END + + .align 5 + +CGEMM_L1x8_LOOP: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt CGEMM_L1x8_LOOP + +CGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b CGEMM_L1x8_SUB1 + +CGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b CGEMM_L1x8_SUB1 + +CGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + +CGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x8_SAVE + +CGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x8_SUB2 + +CGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt CGEMM_L1x8_BEGIN + +CGEMM_L1x8_END: + +CGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble CGEMM_L1x1_END + + andi. T1, M, 4 + ble CGEMM_L1x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x4_SUB4 + +CGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble CGEMM_L1x4_LOOP_END + + .align 5 + +CGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt CGEMM_L1x4_LOOP + +CGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b CGEMM_L1x4_SUB1 + +CGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b CGEMM_L1x4_SUB1 + +CGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + +CGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x4_SAVE + +CGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x4_SUB2 + +CGEMM_L1x4_SAVE: + + SAVE1x4 + +CGEMM_L1x4_END: + +CGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble CGEMM_L1x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x2_SUB4 + +CGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble CGEMM_L1x2_LOOP_END + + .align 5 + +CGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt CGEMM_L1x2_LOOP + +CGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b CGEMM_L1x2_SUB1 + +CGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b CGEMM_L1x2_SUB1 + +CGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + +CGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x2_SAVE + +CGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x2_SUB2 + +CGEMM_L1x2_SAVE: + + SAVE1x2 + +CGEMM_L1x2_END: + +CGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble CGEMM_L1x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble CGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble CGEMM_L1x1_SUB4 + +CGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble CGEMM_L1x1_LOOP_END + + .align 5 + +CGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt CGEMM_L1x1_LOOP + +CGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b CGEMM_L1x1_SUB1 + +CGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b CGEMM_L1x1_SUB1 + +CGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + +CGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble CGEMM_L1x1_SAVE + +CGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt CGEMM_L1x1_SUB2 + +CGEMM_L1x1_SAVE: + + SAVE1x1 + +CGEMM_L1x1_END: + +CGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S new file mode 100644 index 000000000..9a18cb189 --- /dev/null +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -0,0 +1,6355 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
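The macro file that follows begins by selecting add/subtract variants (XSFADD_*/XVFADD_*) per transpose/conjugate case: for c += a*b the real part is a_r*b_r - a_i*b_i and the imaginary part is a_r*b_i + a_i*b_r, and conjugating A flips the sign of a_i while conjugating B flips the sign of b_i in those formulas. A scalar C illustration of the four sign cases (cmadd, conj_a and conj_b are illustrative names, not symbols from the patch):

/* Accumulate c += op(a) * op(b) for complex numbers given as (re, im)
 * pairs, where op() optionally conjugates its operand. Mirrors the sign
 * choices encoded by the NN/CN/NC/CC macro families. Illustrative only. */
static void cmadd(float c[2], const float a[2], const float b[2],
                  int conj_a, int conj_b)
{
    float ar = a[0], ai = conj_a ? -a[1] : a[1];
    float br = b[0], bi = conj_b ? -b[1] : b[1];
    c[0] += ar * br - ai * bi;   /* real part      */
    c[1] += ar * bi + ai * br;   /* imaginary part */
}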
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x 
vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, 
vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r 
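// Note on the _1/_2 pairing: KERNEL4x8_1 multiplies the vs0-vs3 (A) and
// vs8-vs15 (broadcast B) set while loading the next panel into vs4-vs7 and
// vs16-vs23; this KERNEL4x8_2 block does the converse, multiplying the
// vs4-vs7/vs16-vs23 set while refilling vs0-vs3/vs8-vs15. The xvmaddasp
// results in vs32-vs63 are lane-wise partial products that the SAVE macros
// later combine into real/imaginary parts using the XVFADD_* selections.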
+ xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 
// a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, 
a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 
// add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // 
add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add 
a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i 
* b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i 
+ XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + 
lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // 
load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + 
xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i 
* alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp 
vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si 
// r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, 
BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + 
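+// Operand layout assumed by the loads in these N=4 kernels: the packed A
+// panel keeps complex elements interleaved (a0_r, a0_i, a1_r, a1_i within a
+// vector), while the packed B panel evidently stores each b component already
+// broadcast across a full 16-byte vector (b0_r, then b0_i, and so on), as the
+// product comments in the multiply blocks indicate.  That is why the plain
+// xvmulsp/xvmaddasp need no in-kernel splat and why BO advances twice by 64
+// bytes per k iteration.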
addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, 
vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + 
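+// vs20/vs21 and vs22/vs23 now hold the two alpha-scaled results, still split
+// into separate real and imaginary vectors:
+//   out_r = r_r*alpha_r - r_i*alpha_i
+//   out_i = r_r*alpha_i + r_i*alpha_r
+// The xxsldwi shifts below (vs24 was cleared to zero for exactly this) slide
+// each component into its target word, and the xvaddsp chain recombines them
+// into the interleaved (r0_r, r0_i, r1_r, r1_i) layout of the C tile before
+// the sum is folded into vs0 and stored back.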
xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp 
vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load 
b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r 
* alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, 
a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, 
a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, 
a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i 
* b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
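+// Every per-element block in these SAVE macros uses the same reduction: one
+// accumulator holds the four a*b_r products and its partner the four a*b_i
+// products, the xxspltw copies above pull the individual words apart, and the
+// XVFADD_R1/R2/I1/I2 macros (defined earlier in this file, choosing add or
+// subtract to suit the conjugation variant) fold them into the real and
+// imaginary sums.  For the plain non-conjugated case the net effect per
+// complex element is roughly this scalar C:
+//   r_r = a_r*b_r - a_i*b_i;            r_i = a_r*b_i + a_i*b_r;
+//   c_r += r_r*alpha_r - r_i*alpha_i;   c_i += r_r*alpha_i + r_i*alpha_r;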
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x 
vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw 
vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + 
xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 
+**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 
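+// For each column below: splat the accumulated real/imaginary products (vs32..),
+// reduce them into r_r/r_i via the XVFADD_* macros, multiply by the complex
+// alpha (alpha_sr/alpha_si) and accumulate the result into C before storing.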
+ +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x 
vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, 
o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i + + addi BO, BO, 64 + + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, 
a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, 
o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + lxvw4x vs2, o32, AO // load a4, a5 + lxvw4x vs3, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i 
* alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * 
alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + lxvw4x vs5, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, 
a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + lxvw4x vs1, o16, AO // load a2, a3 + + addi AO, AO, 32 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + 
xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + addi AO, AO, 16 + + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + 
+ xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp 
vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + xxlxor vs24, vs24, vs24 + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c new file mode 100644 index 000000000..da97c896e --- /dev/null +++ b/kernel/power/cswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "cswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c new file mode 100644 index 000000000..90ab59c54 --- /dev/null +++ b/kernel/power/cswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + "lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvw4x 56, 0, %1 \n\t" + "lxvw4x 57, %5, %1 \n\t" + "lxvw4x 58, %6, %1 \n\t" + "lxvw4x 59, %7, %1 \n\t" + "lxvw4x 60, %8, %1 \n\t" + "lxvw4x 61, %9, %1 \n\t" + "lxvw4x 62, %10, %1 \n\t" + "lxvw4x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + "stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 40, 0, %3 \n\t" + "stxvw4x 41, %5, %3 \n\t" + "stxvw4x 42, %6, %3 \n\t" + "stxvw4x 43, %7, %3 \n\t" + "stxvw4x 44, %8, %3 \n\t" + "stxvw4x 45, %9, %3 \n\t" + "stxvw4x 46, %10, %3 \n\t" + "stxvw4x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvw4x 56, 0, %4 \n\t" + "stxvw4x 57, %5, %4 \n\t" + "stxvw4x 58, %6, %4 \n\t" + "stxvw4x 59, %7, %4 \n\t" + "stxvw4x 60, %8, %4 \n\t" + "stxvw4x 61, %9, %4 \n\t" + "stxvw4x 62, %10, %4 \n\t" + "stxvw4x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S new file mode 100644 index 000000000..460a387fb --- /dev/null +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -0,0 +1,399 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 304(SP) +#define ALPHA_I_SP 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 + +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + +#define o12 r12 +#define KKK r13 +#define K1 r14 +#define L r15 +#define o16 r16 +#define NOTUSED r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o4 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) + std r12, 296(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "ctrmm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + + +#ifdef __64BIT__ + addi T1, SP, 304 +#else + addi T1, SP, 224 +#endif + + lxsspx alpha_dr, 0, T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 + + .align 5 + +#include "ctrmm_logic_8x4_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) + ld r12, 296(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S new file mode 100644 index 000000000..9ab258501 --- /dev/null +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -0,0 +1,1769 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 2 + ble CTRMM_L4_END + +CTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L4x8_END + +CTRMM_L4x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x8_SUB4 + +CTRMM_L4x8_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD4x8_1 + KERNEL4x8_I1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_2 + + addic. L, L, -2 + ble CTRMM_L4x8_LOOP_END + + .align 5 + +CTRMM_L4x8_LOOP: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt CTRMM_L4x8_LOOP + +CTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b CTRMM_L4x8_SUB1 + +CTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b CTRMM_L4x8_SUB1 + +CTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x8_SAVE + b CTRMM_L4x8_SUB2 + +CTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x8_SAVE + +CTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L4x8_SUB2 + +CTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L4x8_BEGIN + +CTRMM_L4x8_END: + +CTRMM_L4x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L4x1_END + + andi. T1, M, 4 + ble CTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x4_SUB4 + +CTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble CTRMM_L4x4_LOOP_END + + .align 5 + +CTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt CTRMM_L4x4_LOOP + +CTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b CTRMM_L4x4_SUB1 + +CTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b CTRMM_L4x4_SUB1 + +CTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x4_SAVE + b CTRMM_L4x4_SUB2 + +CTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x4_SAVE + +CTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x4_SUB2 + +CTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L4x4_END: + +CTRMM_L4x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x2_SUB4 + +CTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble CTRMM_L4x2_LOOP_END + + .align 5 + +CTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt CTRMM_L4x2_LOOP + +CTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b CTRMM_L4x2_SUB1 + +CTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b CTRMM_L4x2_SUB1 + +CTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x2_SAVE + b CTRMM_L4x2_SUB2 + +CTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x2_SAVE + +CTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x2_SUB2 + +CTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L4x2_END: + +CTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L4x1_SUB4 + +CTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble CTRMM_L4x1_LOOP_END + + .align 5 + +CTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L4x1_LOOP + +CTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b CTRMM_L4x1_SUB1 + +CTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b CTRMM_L4x1_SUB1 + +CTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L4x1_SAVE + b CTRMM_L4x1_SUB2 + +CTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L4x1_SAVE + +CTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L4x1_SUB2 + +CTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt CTRMM_L4_BEGIN + + andi. T2, N, 3 + ble L999_H2 + +CTRMM_L4_END: + + b CTRMM_L2_BEGIN + +L999_H1: + + b L999_H2 + +CTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble CTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L2x8_END + +CTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x8_SUB4 + +CTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble CTRMM_L2x8_LOOP_END + + .align 5 + +CTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt CTRMM_L2x8_LOOP + +CTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b CTRMM_L2x8_SUB1 + +CTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b CTRMM_L2x8_SUB1 + +CTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x8_SAVE + b CTRMM_L2x8_SUB2 + +CTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x8_SAVE + +CTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L2x8_SUB2 + +CTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L2x8_BEGIN + +CTRMM_L2x8_END: + +CTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L2x1_END + + andi. T1, M, 4 + ble CTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x4_SUB4 + +CTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble CTRMM_L2x4_LOOP_END + + .align 5 + +CTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt CTRMM_L2x4_LOOP + +CTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b CTRMM_L2x4_SUB1 + +CTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b CTRMM_L2x4_SUB1 + +CTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x4_SAVE + b CTRMM_L2x4_SUB2 + +CTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x4_SAVE + +CTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x4_SUB2 + +CTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L2x4_END: + +CTRMM_L2x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x2_SUB4 + +CTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble CTRMM_L2x2_LOOP_END + + .align 5 + +CTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt CTRMM_L2x2_LOOP + +CTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b CTRMM_L2x2_SUB1 + +CTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b CTRMM_L2x2_SUB1 + +CTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x2_SAVE + b CTRMM_L2x2_SUB2 + +CTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x2_SAVE + +CTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x2_SUB2 + +CTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L2x2_END: + +CTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L2x1_SUB4 + +CTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble CTRMM_L2x1_LOOP_END + + .align 5 + +CTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L2x1_LOOP + +CTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b CTRMM_L2x1_SUB1 + +CTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b CTRMM_L2x1_SUB1 + +CTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L2x1_SAVE + b CTRMM_L2x1_SUB2 + +CTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L2x1_SAVE + +CTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L2x1_SUB2 + +CTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +CTRMM_L2_END: + + b CTRMM_L1_BEGIN + +L999_H2: + + b L999 + +CTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble CTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble CTRMM_L1x8_END + +CTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x8_SUB4 + +CTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble CTRMM_L1x8_LOOP_END + + .align 5 + +CTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt CTRMM_L1x8_LOOP + +CTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b CTRMM_L1x8_SUB1 + +CTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b CTRMM_L1x8_SUB1 + +CTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x8_SAVE + b CTRMM_L1x8_SUB2 + +CTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x8_SAVE + +CTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. 
L, L, -1 + bgt CTRMM_L1x8_SUB2 + +CTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt CTRMM_L1x8_BEGIN + +CTRMM_L1x8_END: + +CTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble CTRMM_L1x1_END + + andi. T1, M, 4 + ble CTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x4_SUB4 + +CTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble CTRMM_L1x4_LOOP_END + + .align 5 + +CTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt CTRMM_L1x4_LOOP + +CTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b CTRMM_L1x4_SUB1 + +CTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b CTRMM_L1x4_SUB1 + +CTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x4_SAVE + b CTRMM_L1x4_SUB2 + +CTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x4_SAVE + +CTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x4_SUB2 + +CTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +CTRMM_L1x4_END: + +CTRMM_L1x2_BEGIN: + + andi. 
T1, M, 2 + ble CTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x2_SUB4 + +CTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble CTRMM_L1x2_LOOP_END + + .align 5 + +CTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt CTRMM_L1x2_LOOP + +CTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b CTRMM_L1x2_SUB1 + +CTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b CTRMM_L1x2_SUB1 + +CTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x2_SAVE + b CTRMM_L1x2_SUB2 + +CTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x2_SAVE + +CTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x2_SUB2 + +CTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +CTRMM_L1x2_END: + +CTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble CTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble CTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble CTRMM_L1x1_SUB4 + +CTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble CTRMM_L1x1_LOOP_END + + .align 5 + +CTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. 
L, L, -1 + bgt CTRMM_L1x1_LOOP + +CTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b CTRMM_L1x1_SUB1 + +CTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b CTRMM_L1x1_SUB1 + +CTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble CTRMM_L1x1_SAVE + b CTRMM_L1x1_SUB2 + +CTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble CTRMM_L1x1_SAVE + +CTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt CTRMM_L1x1_SUB2 + +CTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +CTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +CTRMM_L1_END: diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S new file mode 100644 index 000000000..48a21252c --- /dev/null +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -0,0 +1,6794 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, 
vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, 
a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, 
a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, 
a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, 
vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add 
a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // 
add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw 
vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw 
vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, 
vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 
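+ // when TRMMKERNEL is defined the old C tile is not read: vs0 is cleared here and only alpha*(A*B) is stored back below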
+#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, 
vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor 
vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // 
a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, 
a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, 
a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * 
b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * 
b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * 
b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 
1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + 
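+ // b0..b3 were splatted word-wise (xxspltw), so each xvmaddasp accumulates the a*b_r or a*b_i
+ // partial products for two complex elements of A; SAVE4x2 later recombines them and applies alpha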
xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // 
a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i 
* alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, 
alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp 
vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 
// load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + 
xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, 
a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro 
SAVE2x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, 
r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 
// r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi 
vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i 
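+	// Note on this shift/merge sequence (restating the inline comments above and
+	// below): the xxsldwi shifts by 3, 2, 1 and 0 words against the zeroed vs24
+	// place r0_r, r0_i, r1_r and r1_i into words 0, 1, 2 and 3 respectively; the
+	// xvaddsp chain that follows packs them into one vector so the two complex
+	// C elements held in vs0 are updated with a single vector add and store.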
+ xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // 
a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, 
alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // 
a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // 
r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + 
lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx 
vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 
+ xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 
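+// vs8/vs9 hold b0_r and b0_i splatted across all four word lanes, so each xvmaddasp
+// below accumulates four partial products at once: vs32/vs34/vs36/vs38 collect the
+// a*b0_r terms and vs33/vs35/vs37/vs39 the a*b0_i terms; the real/imaginary
+// recombination and the alpha scaling are deferred to SAVE1x8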
+ + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp 
vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 
// r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x 
vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 
// add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, 
AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + 
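+// the _1 and _2 kernel variants double-buffer their operands: _1 multiplies the values
+// already held in vs0/vs1 and vs8/vs9 while fetching the next element into vs4/vs5 and
+// vs16/vs17, and _2 does the reverse, so the loads overlap the xsmaddadp dependency chain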
addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c new file mode 100644 index 000000000..77f5345ba --- /dev/null +++ b/kernel/power/dasum.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/28 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+
+#define ABS fabs
+
+#else
+
+#define ABS fabsf
+
+#endif
+
+#if defined(POWER8)
+#include "dasum_microk_power8.c"
+#endif
+
+
+#ifndef HAVE_KERNEL_16
+
+static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+{
+
+    BLASLONG i=0;
+    FLOAT *x = x1;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+
+    while ( i< n )
+    {
+
+        temp0 = ABS(x[0]);
+        temp1 = ABS(x[1]);
+        temp2 = ABS(x[2]);
+        temp3 = ABS(x[3]);
+        temp4 = ABS(x[4]);
+        temp5 = ABS(x[5]);
+        temp6 = ABS(x[6]);
+        temp7 = ABS(x[7]);
+
+        sum0 += temp0;
+        sum1 += temp1;
+        sum2 += temp2;
+        sum3 += temp3;
+
+        sum0 += temp4;
+        sum1 += temp5;
+        sum2 += temp6;
+        sum3 += temp7;
+
+        x+=8;
+        i+=8;
+
+    }
+
+    svec[0] = sum0+sum1+sum2+sum3;
+    svec[1] = 0.0;
+
+}
+
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    FLOAT sumf = 0.0;
+    FLOAT svec[2] __attribute__ ((aligned (16)));
+    BLASLONG n1;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    if ( inc_x == 1 )
+    {
+
+        n1 = n & -16;
+        if ( n1 > 0 )
+        {
+
+            dasum_kernel_16(n1, x, svec);
+            sumf = svec[0] + svec[1];
+            i=n1;
+        }
+
+        while(i < n)
+        {
+            sumf += ABS(x[i]);
+            i++;
+        }
+
+    }
+    else
+    {
+
+        n *= inc_x;
+        while(i < n)
+        {
+            sumf += ABS(x[i]);
+            i += inc_x;
+        }
+
+    }
+    return(sumf);
+}
+
+
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
new file mode 100644
index 000000000..cc38c4f7d
--- /dev/null
+++ b/kernel/power/dasum_microk_power8.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -16 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c new file mode 100644 index 000000000..4365bd88d --- /dev/null +++ b/kernel/power/daxpy.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "daxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT a2[4]; + a2[0]=da; + a2[1]=da; + a2[2]=da; + a2[3]=da; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y , a2 ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c new file mode 100644 index 000000000..bb3f73aca --- /dev/null +++ b/kernel/power/daxpy_microk_power8.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, %5, %4 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %8, %8, -8 \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "addi %3, %3, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %3, %3, 64 \n\t" + + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (pre) // 9 + : "cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c new file mode 100644 index 000000000..059c0e5a9 --- /dev/null +++ b/kernel/power/dcopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/25 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+#include "common.h"
+
+#if defined(POWER8)
+#include "dcopy_microk_power8.c"
+#endif
+
+#ifndef HAVE_KERNEL_32
+
+static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+
+    BLASLONG i=0;
+    FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
+    FLOAT *x1=x;
+    FLOAT *y1=y;
+
+    while ( i<n )
+    {
+
+        f0 = x1[0];
+        f1 = x1[1];
+        f2 = x1[2];
+        f3 = x1[3];
+        f4 = x1[4];
+        f5 = x1[5];
+        f6 = x1[6];
+        f7 = x1[7];
+
+        y1[0] = f0;
+        y1[1] = f1;
+        y1[2] = f2;
+        y1[3] = f3;
+        y1[4] = f4;
+        y1[5] = f5;
+        y1[6] = f6;
+        y1[7] = f7;
+
+        x1 += 8;
+        y1 += 8;
+        i += 8;
+
+    }
+
+}
+
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+
+    if ( n <= 0 ) return(0);
+
+    if ( (inc_x == 1) && (inc_y == 1) )
+    {
+
+        BLASLONG n1 = n & -32;
+
+        if ( n1 > 0 )
+        {
+            dcopy_kernel_32(n1, x, y);
+            i=n1;
+        }
+
+        while(i < n)
+        {
+            y[i] = x[i] ;
+            i++ ;
+
+        }
+
+
+    }
+    else
+    {
+
+        while(i < n)
+        {
+            y[iy] = x[ix] ;
+            ix += inc_x ;
+            iy += inc_y ;
+            i++ ;
+
+        }
+
+    }
+    return(0);
+
+
+}
+
+
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
new file mode 100644
index 000000000..04f7db556
--- /dev/null
+++ b/kernel/power/dcopy_microk_power8.c
@@ -0,0 +1,174 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c new file mode 100644 index 000000000..cef60a2e5 --- /dev/null +++ b/kernel/power/ddot.c @@ -0,0 +1,139 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "ddot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + ddot_kernel_8(n1, x, y , &dot ); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c new file mode 100644 index 000000000..b88049212 --- /dev/null +++ b/kernel/power/ddot_microk_power8.c @@ -0,0 +1,178 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + "xxswapd 33, 32 \n\t" + + "xsadddp 32, 32, 33 \n\t" + + "stxsdx 32, 0, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c new file mode 100644 index 000000000..812d09d15 --- /dev/null +++ b/kernel/power/dgemv_n.c @@ -0,0 +1,426 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
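/*
 * The ddot micro-kernel above keeps eight independent accumulators
 * (vs32..vs39), each updated by xvmaddadp on a 2-wide double vector, and
 * only combines them after the loop: an xvadddp tree followed by
 * xxswapd + xsadddp for the final horizontal add into *dot.  A scalar
 * sketch of that accumulation strategy (illustrative only; the real
 * kernel is the asm above):
 */
#include <stddef.h>

static double ddot_ref_8acc(size_t n, const double *x, const double *y)
{
    double acc[8] = {0.0};                      /* mirrors vs32..vs39     */
    size_t i;

    for (i = 0; i + 16 <= n; i += 16)           /* 16 doubles per pass    */
        for (size_t k = 0; k < 8; k++)
            acc[k] += x[i + 2*k]     * y[i + 2*k]
                    + x[i + 2*k + 1] * y[i + 2*k + 1];

    for (; i < n; i++)                          /* scalar tail            */
        acc[0] += x[i] * y[i];

    /* pairwise reduction, same shape as the xvadddp tree */
    return ((acc[0] + acc[1]) + (acc[2] + acc[3]))
         + ((acc[4] + acc[5]) + (acc[6] + acc[7]));
}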
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + + +#if defined(POWER8) +#include "dgemv_n_microk_power8.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i<2; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap; + + for ( i=0; i<1; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r); + a_ptr += lda; + + } + + } + + a += NB; + if ( 
inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c new file mode 100644 index 000000000..9eabe555c --- /dev/null +++ b/kernel/power/dgemv_n_microk_power8.c @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
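/*
 * dgemv_n.c above computes y += alpha * A * x by sweeping the columns
 * four at a time (dgemv_kernel_4x4, with 4x2/4x1 cleanup) and limiting
 * each pass to NBMAX rows so the accumulated block of y stays cache
 * resident; when inc_y != 1 the block is built in a contiguous ybuffer
 * and scattered back by add_y().  A compact reference of that blocking,
 * unit strides only -- a sketch with illustrative names, not the
 * routine itself:
 */
#include <stddef.h>

#define NB_MAX 4096                             /* same role as NBMAX above */

static void dgemv_n_ref(int m, int n, double alpha,
                        const double *a, int lda,
                        const double *x, double *y)
{
    for (int i0 = 0; i0 < m; i0 += NB_MAX) {
        int nb = (m - i0 < NB_MAX) ? m - i0 : NB_MAX;
        int j = 0;
        for (; j + 4 <= n; j += 4) {                  /* 4 columns per pass */
            const double *a0 = a + i0 + (size_t)(j + 0) * lda;
            const double *a1 = a + i0 + (size_t)(j + 1) * lda;
            const double *a2 = a + i0 + (size_t)(j + 2) * lda;
            const double *a3 = a + i0 + (size_t)(j + 3) * lda;
            double x0 = alpha * x[j],     x1 = alpha * x[j + 1];
            double x2 = alpha * x[j + 2], x3 = alpha * x[j + 3];
            for (int i = 0; i < nb; i++)
                y[i0 + i] += a0[i] * x0 + a1[i] * x1 + a2[i] * x2 + a3[i] * x3;
        }
        for (; j < n; j++) {                          /* leftover columns   */
            const double *aj = a + i0 + (size_t)j * lda;
            double xj = alpha * x[j];
            for (int i = 0; i < nb; i++)
                y[i0 + i] += aj[i] * xj;
        }
    }
}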
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i=n; + BLASLONG o8 = 8; + BLASLONG o16 = 16; + BLASLONG o24 = 24; + BLASLONG pre = 384; + + FLOAT *a0,*a1,*a2,*a3; + FLOAT *y1=y+1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]+1; + a1 = ap[1]+1; + a2 = ap[2]+1; + a3 = ap[3]+1; + + x[0]=xo[0] * *alpha; + x[1]=xo[1] * *alpha; + x[2]=xo[2] * *alpha; + x[3]=xo[3] * *alpha; + + + __asm__ __volatile__ + ( + "lxvdsx 32, 0 , %1 \n\t" // x0 + "lxvdsx 33,%3 , %1 \n\t" // x1 + "lxvdsx 34,%4 , %1 \n\t" // x2 + "lxvdsx 35,%5 , %1 \n\t" // x3 + "addi %2 , %2 , -8 \n\t" + "addi %6 , %6 , -8 \n\t" + "addi %7 , %7 , -8 \n\t" + "addi %8 , %8 , -8 \n\t" + "addi %9 , %9 , -8 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %6, %6, 32 \n\t" + "addi %7, %7, 32 \n\t" + "addi %8, %8, 32 \n\t" + "addi %9, %9, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "dcbt %6, %10 \n\t" + "dcbt %7, %10 \n\t" + "dcbt %8, %10 \n\t" + "dcbt %9, %10 \n\t" + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "xvmaddadp 40, 50, 33 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "xvmaddadp 40, 52, 34 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "xvmaddadp 40, 54, 35 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + : + : + "r" (i), // 0 + "r" (x), // 1 + "r" (y1), // 2 + "r" (o8), // 3 + "r" (o16), // 4 + "r" (o24), // 5 + "r" (a0), // 6 + "r" (a1), // 7 + "r" (a2), // 8 + "r" (a3), // 9 + "r" (pre) // 10 + : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/drot.c b/kernel/power/drot.c new file mode 100644 index 000000000..c93f69b12 --- /dev/null +++ b/kernel/power/drot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
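/*
 * In the dgemv_n micro-kernel above the 4-row update body is stamped out
 * four times between "addic." checks, so a full pass of the hot loop
 * handles up to 16 rows before the backward branch.  A small
 * self-contained call that exercises this no-transpose dgemv path
 * through the standard Fortran BLAS symbol (values are illustrative;
 * link against OpenBLAS, e.g. -lopenblas):
 */
#include <stdio.h>

extern void dgemv_(const char *trans, const int *m, const int *n,
                   const double *alpha, const double *a, const int *lda,
                   const double *x, const int *incx,
                   const double *beta, double *y, const int *incy);

int main(void)
{
    int m = 8, n = 4, lda = 8, inc = 1;
    double alpha = 2.0, beta = 1.0;
    double a[32], x[4] = {1, 2, 3, 4}, y[8] = {0};

    for (int j = 0; j < n; j++)                 /* leading 4x4 identity   */
        for (int i = 0; i < m; i++)
            a[i + j * lda] = (i == j) ? 1.0 : 0.0;

    dgemv_("N", &m, &n, &alpha, a, &lda, x, &inc, &beta, y, &inc);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);   /* expect 2 4 6 8 */
    return 0;
}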
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "drot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + drot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c new file mode 100644 index 000000000..4444ac7eb --- /dev/null +++ b/kernel/power/drot_microk_power8.c @@ -0,0 +1,211 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
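/*
 * drot applies the plane (Givens) rotation in place:
 *     x[i] <- c*x[i] + s*y[i],   y[i] <- c*y[i] - s*x[i].
 * The wrapper above broadcasts c and s into the four-element c1[]/s1[]
 * arrays so the vector kernel can splat them once.  The scalar fallback
 * follows the same 4-way unrolled pattern as the other C fallbacks in
 * this patch; a minimal sketch of it (not the literal kernel body;
 * assumes n is a multiple of 4):
 */
static void drot_ref_kernel(long n, double *x, double *y, double c, double s)
{
    for (long i = 0; i < n; i += 4)
        for (int k = 0; k < 4; k++) {
            double f = c * x[i + k] + s * y[i + k];   /* new x */
            double g = c * y[i + k] - s * x[i + k];   /* new y */
            x[i + k] = f;
            y[i + k] = g;
        }
}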
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxsdx 36 , %5, %3 \n\t" // load c + "lxsdx 37 , %5, %4 \n\t" // load s + "addi %8 , %8, -8 \n\t" + "addi %9 , %9, -8 \n\t" + + "xxspltd 36 , 36, 0 \n\t" + "xxspltd 37 , 37, 0 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c new file mode 100644 index 000000000..c62a56315 --- /dev/null +++ b/kernel/power/dscal.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
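/*
 * The header of drot_microk_power8.c notes that fused multiply-add is
 * avoided because of precision problems with the LAPACK tests: an
 * xvmaddadp would compute fma(c, x, s*y) with a single rounding, which
 * can differ in the last bit from the reference (c*x) + (s*y), so the
 * kernel uses xvmuldp followed by xvadddp/xvsubdp instead.  Illustrative
 * comparison using C99 fma() (compile with -lm):
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
    double c = 0.6, s = 0.8, x = 1.0 / 3.0, y = 2.0 / 7.0;

    double two_roundings = c * x + s * y;       /* what the kernel does   */
    double one_rounding  = fma(c, x, s * y);    /* what FMA would give    */

    /* the two results may differ by one ulp */
    printf("%.17g\n%.17g\n", two_roundings, one_rounding);
    return 0;
}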
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dscal_microk_power8.c" +#endif + +#if !defined(HAVE_KERNEL_8) + +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c new file mode 100644 index 000000000..d90c3d80c --- /dev/null +++ b/kernel/power/dscal_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
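/*
 * dscal.c above dispatches on alpha: for alpha == 0.0 and unit stride it
 * calls dscal_kernel_8_zero (further below), which only stores zeros and
 * never reads x; otherwise the multiply kernel is used, with alpha
 * splat across a vector register inside the asm (lxsdx + xxspltd).  A
 * minimal scalar equivalent of the multiply kernel, following the
 * pattern of the other fallbacks in this patch (sketch only; the wrapper
 * calls it with n a multiple of 16):
 */
static void dscal_ref_kernel(long n, const double *da, double *x)
{
    double alpha = *da;

    for (long i = 0; i < n; i += 8) {
        x[i]     *= alpha;  x[i + 1] *= alpha;
        x[i + 2] *= alpha;  x[i + 3] *= alpha;
        x[i + 4] *= alpha;  x[i + 5] *= alpha;
        x[i + 6] *= alpha;  x[i + 7] *= alpha;
    }
}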
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, 0, %3 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -8 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 32, 0, %1 \n\t" + "stxvd2x 32, %5, %1 \n\t" + "stxvd2x 32, %6, %1 \n\t" + "stxvd2x 32, %7, %1 \n\t" + "stxvd2x 32, %8, %1 \n\t" + "stxvd2x 32, %9, %1 \n\t" + "stxvd2x 32, %10, %1 \n\t" + "stxvd2x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c new file mode 100644 index 000000000..fd2dec9c4 --- /dev/null +++ b/kernel/power/dswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c new file mode 100644 index 000000000..77747c3b9 --- /dev/null +++ b/kernel/power/dswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
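/*
 * dswap exchanges x and y; the micro-kernel below moves 32 doubles per
 * iteration and loads 16 vector registers from each operand before
 * issuing any store, so no element is overwritten before it has been
 * read.  Element-wise scalar sketch of the same exchange (illustrative;
 * the wrapper above hands the kernel a block that is a multiple of 32):
 */
static void dswap_ref_kernel(long n, double *x, double *y)
{
    for (long i = 0; i < n; i++) {
        double t = x[i];
        x[i] = y[i];
        y[i] = t;
    }
}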
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c new file mode 100644 index 000000000..43311f2ba --- /dev/null +++ b/kernel/power/sasum.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "sasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sasum_kernel_32(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + + diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c new file mode 100644 index 000000000..847fffe04 --- /dev/null +++ b/kernel/power/sasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -32 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c new file mode 100644 index 000000000..167c29bab --- /dev/null +++ b/kernel/power/scopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
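/*
 * sasum returns the sum of |x[i]|; the ABS macro above maps to fabsf()
 * from <math.h> in single precision.  The micro-kernel keeps eight
 * 4-wide float accumulators (xvabssp + xvaddsp), folds them into one
 * vector after the loop and stores it to the 16-byte aligned svec[4];
 * the C wrapper then adds the four lanes and handles the tail.  Scalar
 * sketch of that finishing step (illustrative helper, not the wrapper):
 */
#include <math.h>

static float sasum_finish(const float svec[4], const float *x,
                          long n1, long n)
{
    float sum = svec[0] + svec[1] + svec[2] + svec[3];   /* lane reduction */

    for (long i = n1; i < n; i++)                        /* scalar tail    */
        sum += fabsf(x[i]);

    return sum;
}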
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "scopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + scopy_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c new file mode 100644 index 000000000..2e08e3561 --- /dev/null +++ b/kernel/power/scopy_microk_power8.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
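/*
 * scopy copies x into y; the micro-kernel below moves 32 floats (eight
 * lxvw4x/stxvw4x pairs) per iteration and the wrapper above handles the
 * tail and the strided case.  For the unit-stride block the fallback is
 * equivalent to a plain copy -- a sketch only, not the literal kernel
 * body (the wrapper calls it with n a multiple of 32):
 */
#include <string.h>

static void scopy_ref_kernel(long n, const float *x, float *y)
{
    memcpy(y, x, (size_t)n * sizeof(float));
}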
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c new file mode 100644 index 000000000..52fb1fe24 --- /dev/null +++ b/kernel/power/sdot.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
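The scopy_microk_power8.c loop above is software-pipelined: eight vectors are loaded before the loop, each iteration stores the block loaded on the previous pass while loading the next one, and the final eight stores are drained after the branch at label 2. Before the sdot.c listing that follows, here is a minimal C sketch of that structure, assuming a contiguous copy whose length is a non-zero multiple of 32 (illustration only; the memcpy calls stand in for the lxvw4x/stxvw4x groups, and the names are invented):

#include <string.h>
#include <stdio.h>

/* Illustration only: prime 8 "registers" before the loop, then in each
 * pass store the previous block while loading the next, and drain after
 * the loop, as scopy_kernel_32 does with vs40..vs47. */
static void pipelined_copy_32(long n, const float *x, float *y)  /* n % 32 == 0, n >= 32 */
{
    float reg[32];                       /* stands in for vs40..vs47 (8 x 4 lanes) */

    memcpy(reg, x, sizeof reg);          /* prime: the 8 lxvw4x before the loop    */
    x += 32;
    n -= 32;

    while (n > 0) {                      /* "1:" loop body                         */
        memcpy(y, reg, sizeof reg);      /*   store previous block (stxvw4x)       */
        memcpy(reg, x, sizeof reg);      /*   load next block      (lxvw4x)        */
        x += 32;
        y += 32;
        n -= 32;
    }

    memcpy(y, reg, sizeof reg);          /* "2:" drain the last block              */
}

int main(void)
{
    float src[96], dst[96];
    for (int i = 0; i < 96; i++) { src[i] = (float)i; dst[i] = -1.0f; }
    pipelined_copy_32(96, src, dst);
    printf("dst[0]=%g dst[95]=%g\n", dst[0], dst[95]);   /* 0 and 95 */
    return 0;
}

Interleaving the stores of block k with the loads of block k+1 keeps the load and store pipes busy and hides load latency behind work that is already independent.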
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + sdot_kernel_16(n1, x, y , &dot ); + + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c new file mode 100644 index 000000000..6dd588acd --- /dev/null +++ b/kernel/power/sdot_microk_power8.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
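Both the unrolled C fallback in sdot.c above and the VSX microkernel that follows avoid a single serial accumulator: the assembly keeps eight vector partial sums (vs32-vs39), folds them pairwise after the loop, and leaves four lane sums in tempdot[] for the wrapper to add. A short sketch of that idea, assuming a contiguous vector length that is a multiple of 4 (illustration only, not one of the committed files, names invented):

#include <stdio.h>

/* Illustration only: dot product with independent partial sums, the idea
 * behind vs32..vs39 and tempdot[4] in the POWER8 sdot microkernel. */
static float sdot_blocked(long n, const float *x, const float *y)  /* n % 4 == 0 */
{
    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };   /* independent accumulators */

    for (long i = 0; i < n; i += 4)
        for (int l = 0; l < 4; l++)
            acc[l] += x[i + l] * y[i + l];       /* one xvmaddasp lane each */

    /* final reduction, done once outside the hot loop */
    return (acc[0] + acc[1]) + (acc[2] + acc[3]);
}

int main(void)
{
    float x[16], y[16];
    for (int i = 0; i < 16; i++) { x[i] = 1.0f; y[i] = (float)i; }
    printf("dot = %g\n", sdot_blocked(16, x, y));   /* 0+1+...+15 = 120 */
    return 0;
}

Because float addition is not associative, the result can differ in the last bits from a strictly left-to-right loop; that is the expected behaviour of these blocked kernels.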
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + FLOAT tempdot[4]; + + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 \n\t" + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32 , 33 \n\t" + "xvaddsp 34, 34 , 35 \n\t" + "xvaddsp 36, 36 , 37 \n\t" + "xvaddsp 38, 38 , 39 \n\t" + + "xvaddsp 32, 32 , 34 \n\t" + "xvaddsp 36, 36 , 38 \n\t" + + "xvaddsp 32, 32 , 36 \n\t" + + "stxvw4x 32, 0 , %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (tempdot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + + *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; + + +} + + diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S new file mode 100644 index 000000000..77f3f7cfb --- /dev/null +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -0,0 +1,371 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 32752 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 +#define o4 r15 +#define o12 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BBO r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) 
+ std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif + + slwi LDC, LDC, 2 + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#else + lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + addi T1, SP, 300 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 + + + +#include "sgemm_logic_16x8_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S new file mode 100644 index 000000000..06bb79ea3 --- /dev/null +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -0,0 +1,2323 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 3 + ble SGEMM_L8_END + +SGEMM_L8_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +SGEMM_L8_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L8_COPYB + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L8x16_END + +SGEMM_L8x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x16_SUB4 + +SGEMM_L8x16_LOOP_START: + + dcbt AO, PRE + dcbt BO, PRE + LOAD8x16_1 + dcbt BO, PRE + KERNEL8x16_I1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble SGEMM_L8x16_LOOP_END + + .align 5 + +SGEMM_L8x16_LOOP: + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + addic. 
L, L, -1 + bgt SGEMM_L8x16_LOOP + +SGEMM_L8x16_LOOP_END: + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + + dcbt BO, PRE + KERNEL8x16_1 + dcbt BO, PRE + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + KERNEL8x16_E2 + + b SGEMM_L8x16_SUB1 + +SGEMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b SGEMM_L8x16_SUB1 + +SGEMM_L8x16_SUB0: + + andi. L, K, 7 + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x16_SAVE + b SGEMM_L8x16_SUB2 + +SGEMM_L8x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x16_SAVE + +SGEMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x16_SUB2 + +SGEMM_L8x16_SAVE: + + SAVE8x16 + + addic. I, I, -1 + bgt SGEMM_L8x16_BEGIN + +SGEMM_L8x16_END: + +SGEMM_L8x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L8x1_END + + andi. T1, M, 8 + ble SGEMM_L8x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x8_SUB4 + +SGEMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble SGEMM_L8x8_LOOP_END + + .align 5 + +SGEMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt SGEMM_L8x8_LOOP + +SGEMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b SGEMM_L8x8_SUB1 + +SGEMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b SGEMM_L8x8_SUB1 + +SGEMM_L8x8_SUB0: + + andi. L, K, 7 + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x8_SAVE + b SGEMM_L8x8_SUB2 + +SGEMM_L8x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x8_SAVE + +SGEMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x8_SUB2 + +SGEMM_L8x8_SAVE: + + SAVE8x8 + +SGEMM_L8x8_END: + +SGEMM_L8x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L8x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x4_SUB4 + +SGEMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble SGEMM_L8x4_LOOP_END + + .align 5 + +SGEMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt SGEMM_L8x4_LOOP + +SGEMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b SGEMM_L8x4_SUB1 + +SGEMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b SGEMM_L8x4_SUB1 + +SGEMM_L8x4_SUB0: + + andi. L, K, 7 + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x4_SAVE + b SGEMM_L8x4_SUB2 + +SGEMM_L8x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x4_SAVE + +SGEMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x4_SUB2 + +SGEMM_L8x4_SAVE: + + SAVE8x4 + +SGEMM_L8x4_END: + +SGEMM_L8x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L8x2_END + mr BO, BBUFFER + srawi. 
L, K, 3 + ble SGEMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x2_SUB4 + +SGEMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble SGEMM_L8x2_LOOP_END + + .align 5 + +SGEMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt SGEMM_L8x2_LOOP + +SGEMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b SGEMM_L8x2_SUB1 + +SGEMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b SGEMM_L8x2_SUB1 + +SGEMM_L8x2_SUB0: + + andi. L, K, 7 + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x2_SAVE + b SGEMM_L8x2_SUB2 + +SGEMM_L8x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x2_SAVE + +SGEMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x2_SUB2 + +SGEMM_L8x2_SAVE: + + SAVE8x2 + +SGEMM_L8x2_END: + +SGEMM_L8x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L8x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L8x1_SUB4 + +SGEMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble SGEMM_L8x1_LOOP_END + + .align 5 + +SGEMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt SGEMM_L8x1_LOOP + +SGEMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b SGEMM_L8x1_SUB1 + +SGEMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b SGEMM_L8x1_SUB1 + +SGEMM_L8x1_SUB0: + + andi. L, K, 7 + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L8x1_SAVE + b SGEMM_L8x1_SUB2 + +SGEMM_L8x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L8x1_SAVE + +SGEMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L8x1_SUB2 + +SGEMM_L8x1_SAVE: + + SAVE8x1 + +SGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt SGEMM_L8_BEGIN + + andi. T2, N, 7 + ble L999 + +SGEMM_L8_END: + + b SGEMM_L4_BEGIN + +L999_H1: + + b L999 + +SGEMM_L4_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +SGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L4_COPYB + + andi. T1, N, 4 + ble SGEMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L4x16_END + +SGEMM_L4x16_BEGIN: + + + mr BO, BBUFFER + srawi. 
L, K, 3 + ble SGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x16_SUB4 + +SGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble SGEMM_L4x16_LOOP_END + + .align 5 + +SGEMM_L4x16_LOOP: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt SGEMM_L4x16_LOOP + +SGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + KERNEL4x16_E2 + + b SGEMM_L4x16_SUB1 + +SGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b SGEMM_L4x16_SUB1 + +SGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x16_SAVE + b SGEMM_L4x16_SUB2 + +SGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x16_SAVE + +SGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x16_SUB2 + +SGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt SGEMM_L4x16_BEGIN + +SGEMM_L4x16_END: + +SGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L4x1_END + + andi. T1, M, 8 + ble SGEMM_L4x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x8_SUB4 + +SGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble SGEMM_L4x8_LOOP_END + + .align 5 + +SGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt SGEMM_L4x8_LOOP + +SGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b SGEMM_L4x8_SUB1 + +SGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b SGEMM_L4x8_SUB1 + +SGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x8_SAVE + b SGEMM_L4x8_SUB2 + +SGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x8_SAVE + +SGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x8_SUB2 + +SGEMM_L4x8_SAVE: + + SAVE4x8 + +SGEMM_L4x8_END: + +SGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L4x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x4_SUB4 + +SGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble SGEMM_L4x4_LOOP_END + + .align 5 + +SGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt SGEMM_L4x4_LOOP + +SGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b SGEMM_L4x4_SUB1 + +SGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b SGEMM_L4x4_SUB1 + +SGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x4_SAVE + b SGEMM_L4x4_SUB2 + +SGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x4_SAVE + +SGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x4_SUB2 + +SGEMM_L4x4_SAVE: + + SAVE4x4 + +SGEMM_L4x4_END: + +SGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L4x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x2_SUB4 + +SGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble SGEMM_L4x2_LOOP_END + + .align 5 + +SGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt SGEMM_L4x2_LOOP + +SGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b SGEMM_L4x2_SUB1 + +SGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b SGEMM_L4x2_SUB1 + +SGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x2_SAVE + b SGEMM_L4x2_SUB2 + +SGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x2_SAVE + +SGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L4x2_SUB2 + +SGEMM_L4x2_SAVE: + + SAVE4x2 + +SGEMM_L4x2_END: + +SGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L4x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L4x1_SUB4 + +SGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble SGEMM_L4x1_LOOP_END + + .align 5 + +SGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt SGEMM_L4x1_LOOP + +SGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b SGEMM_L4x1_SUB1 + +SGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b SGEMM_L4x1_SUB1 + +SGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L4x1_SAVE + b SGEMM_L4x1_SUB2 + +SGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L4x1_SAVE + +SGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt SGEMM_L4x1_SUB2 + +SGEMM_L4x1_SAVE: + + SAVE4x1 + +SGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +SGEMM_L4_END: +SGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +SGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L2_COPYB + + andi. T1, N, 2 + ble SGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble SGEMM_L2x16_END + +SGEMM_L2x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x16_SUB4 + +SGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble SGEMM_L2x16_LOOP_END + + .align 5 + +SGEMM_L2x16_LOOP: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt SGEMM_L2x16_LOOP + +SGEMM_L2x16_LOOP_END: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + KERNEL2x16_E2 + + b SGEMM_L2x16_SUB1 + +SGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b SGEMM_L2x16_SUB1 + +SGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x16_SAVE + b SGEMM_L2x16_SUB2 + +SGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x16_SAVE + +SGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x16_SUB2 + +SGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt SGEMM_L2x16_BEGIN + +SGEMM_L2x16_END: + +SGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L2x1_END + + andi. T1, M, 8 + ble SGEMM_L2x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x8_SUB4 + +SGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble SGEMM_L2x8_LOOP_END + + .align 5 + +SGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt SGEMM_L2x8_LOOP + +SGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b SGEMM_L2x8_SUB1 + +SGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b SGEMM_L2x8_SUB1 + +SGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x8_SAVE + b SGEMM_L2x8_SUB2 + +SGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x8_SAVE + +SGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt SGEMM_L2x8_SUB2 + +SGEMM_L2x8_SAVE: + + SAVE2x8 + +SGEMM_L2x8_END: + +SGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L2x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x4_SUB4 + +SGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble SGEMM_L2x4_LOOP_END + + .align 5 + +SGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt SGEMM_L2x4_LOOP + +SGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b SGEMM_L2x4_SUB1 + +SGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b SGEMM_L2x4_SUB1 + +SGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x4_SAVE + b SGEMM_L2x4_SUB2 + +SGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x4_SAVE + +SGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x4_SUB2 + +SGEMM_L2x4_SAVE: + + SAVE2x4 + +SGEMM_L2x4_END: + +SGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L2x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x2_SUB4 + +SGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble SGEMM_L2x2_LOOP_END + + .align 5 + +SGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt SGEMM_L2x2_LOOP + +SGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b SGEMM_L2x2_SUB1 + +SGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b SGEMM_L2x2_SUB1 + +SGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x2_SAVE + b SGEMM_L2x2_SUB2 + +SGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L2x2_SAVE + +SGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x2_SUB2 + +SGEMM_L2x2_SAVE: + + SAVE2x2 + +SGEMM_L2x2_END: + +SGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L2x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L2x1_SUB4 + +SGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble SGEMM_L2x1_LOOP_END + + .align 5 + +SGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt SGEMM_L2x1_LOOP + +SGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b SGEMM_L2x1_SUB1 + +SGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b SGEMM_L2x1_SUB1 + +SGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble SGEMM_L2x1_SAVE + b SGEMM_L2x1_SUB2 + +SGEMM_L2x1_SUB1: + + andi. 
L, K, 7 + ble SGEMM_L2x1_SAVE + +SGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L2x1_SUB2 + +SGEMM_L2x1_SAVE: + + SAVE2x1 + +SGEMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +SGEMM_L2_END: +SGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +SGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L1_COPYB + + andi. T1, N, 1 + ble SGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble SGEMM_L1x16_END + +SGEMM_L1x16_BEGIN: + + + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x16_SUB4 + +SGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble SGEMM_L1x16_LOOP_END + + .align 5 + +SGEMM_L1x16_LOOP: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt SGEMM_L1x16_LOOP + +SGEMM_L1x16_LOOP_END: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + KERNEL1x16_E2 + + b SGEMM_L1x16_SUB1 + +SGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b SGEMM_L1x16_SUB1 + +SGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x16_SAVE + b SGEMM_L1x16_SUB2 + +SGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x16_SAVE + +SGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x16_SUB2 + +SGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt SGEMM_L1x16_BEGIN + +SGEMM_L1x16_END: + +SGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble SGEMM_L1x1_END + + andi. T1, M, 8 + ble SGEMM_L1x8_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x8_SUB4 + +SGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble SGEMM_L1x8_LOOP_END + + .align 5 + +SGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt SGEMM_L1x8_LOOP + +SGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b SGEMM_L1x8_SUB1 + +SGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b SGEMM_L1x8_SUB1 + +SGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x8_SAVE + b SGEMM_L1x8_SUB2 + +SGEMM_L1x8_SUB1: + + andi. 
L, K, 7 + ble SGEMM_L1x8_SAVE + +SGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x8_SUB2 + +SGEMM_L1x8_SAVE: + + SAVE1x8 + +SGEMM_L1x8_END: + +SGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble SGEMM_L1x4_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x4_SUB4 + +SGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble SGEMM_L1x4_LOOP_END + + .align 5 + +SGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt SGEMM_L1x4_LOOP + +SGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b SGEMM_L1x4_SUB1 + +SGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b SGEMM_L1x4_SUB1 + +SGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x4_SAVE + b SGEMM_L1x4_SUB2 + +SGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x4_SAVE + +SGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x4_SUB2 + +SGEMM_L1x4_SAVE: + + SAVE1x4 + +SGEMM_L1x4_END: + +SGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble SGEMM_L1x2_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x2_SUB4 + +SGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble SGEMM_L1x2_LOOP_END + + .align 5 + +SGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt SGEMM_L1x2_LOOP + +SGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b SGEMM_L1x2_SUB1 + +SGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b SGEMM_L1x2_SUB1 + +SGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble SGEMM_L1x2_SAVE + b SGEMM_L1x2_SUB2 + +SGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x2_SAVE + +SGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x2_SUB2 + +SGEMM_L1x2_SAVE: + + SAVE1x2 + +SGEMM_L1x2_END: + +SGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble SGEMM_L1x1_END + mr BO, BBUFFER + srawi. L, K, 3 + ble SGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble SGEMM_L1x1_SUB4 + +SGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble SGEMM_L1x1_LOOP_END + + .align 5 + +SGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt SGEMM_L1x1_LOOP + +SGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b SGEMM_L1x1_SUB1 + +SGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b SGEMM_L1x1_SUB1 + +SGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. 
L, L, -1 + ble SGEMM_L1x1_SAVE + b SGEMM_L1x1_SUB2 + +SGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble SGEMM_L1x1_SAVE + +SGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt SGEMM_L1x1_SUB2 + +SGEMM_L1x1_SAVE: + + SAVE1x1 + +SGEMM_L1x1_END: + +SGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S new file mode 100644 index 000000000..71dc52979 --- /dev/null +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -0,0 +1,5888 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
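Two things tie the logic file above to the macro file that follows. First, each SGEMM_Lx_COPYB loop expands the packed B panel into BBUFFER, splatting every scalar with xxspltw so that one stored vector holds four copies of the same b value. Second, the macros keep an entire output tile in registers: in the N=8, M=16 case the accumulators vs32-vs63 hold all 16x8 results, and each xvmaddasp multiplies a 4-row slice of A by one splatted B value. A compact sketch of that broadcast-and-FMA step, with invented names (illustration only, not one of the committed files):

#include <stdio.h>

/* Illustration only: the SGEMM_Lx_COPYB idea.  Every scalar of the packed
 * B panel is broadcast into a 4-lane vector so the kernel can feed
 * xvmaddasp with a splatted B value against a 4-row vector of A. */
static void broadcast_b(const float *b, long len, float *bbuffer)
{
    for (long k = 0; k < len; k++)          /* one xxspltw + stxvw4x per scalar */
        for (int l = 0; l < 4; l++)
            bbuffer[4 * k + l] = b[k];
}

/* one 4x1 micro-update: c[0..3] += a[0..3] * b_splat, as xvmaddasp does */
static void fma_4x1(float c[4], const float a[4], const float b_splat[4])
{
    for (int l = 0; l < 4; l++)
        c[l] += a[l] * b_splat[l];
}

int main(void)
{
    float b[2] = { 3.0f, -1.0f };
    float bbuf[8];
    broadcast_b(b, 2, bbuf);

    float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float c[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    fma_4x1(c, a, &bbuf[0]);                /* c  = a * 3  */
    fma_4x1(c, a, &bbuf[4]);                /* c += a * -1 */
    printf("c = %g %g %g %g\n", c[0], c[1], c[2], c[3]);  /* 2 4 6 8 */
    return 0;
}

Broadcasting B once during the copy means the inner kernel issues only full-width vector loads and fused multiply-adds, with no per-iteration splat work.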
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + 
lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi 
AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + 
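+	/* plain GEMM: vs0-vs3 still hold the C row loaded above, so the
+	   scaled accumulators are added on top of it */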
xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + 
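+	/* the remaining B values vs9-vs15 update the accumulator pairs
+	   vs34/vs35 ... vs46/vs47, two vectors (8 floats of A) per B value */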
+ xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, 
alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + 
lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + + addi BO, BO, 128 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + 
+#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + 
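+	/* the M=2 tail works on scalars: lxsspx converts each single-precision
+	   element to double-precision form in the VSR, so xsmaddadp/xsmuldp are
+	   used here and stxsspx rounds back to single precision on store */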
xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp 
vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + addi T1, T1, 64 + + lxsspx vs20, o0, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmuldp 
vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + addi T1, T1, 64 + + lxsspx vs12, o0, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 + + + addi BO, BO, 128 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, 
vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif 
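+	/* for TRMM the C tile is not read, so the result is just alpha * acc;
+	   otherwise the 64-byte row loaded into vs0-vs3 above is updated */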
+ +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + 
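+	/* consume the vs4/vs5 and vs16-vs19 operands staged by KERNEL4x8_1;
+	   the loads above already fetched the next vs0/vs1 and vs8-vs11 set */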
xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, 
T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi BO, BO, 64 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + 
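+/* second pipeline stage for the scalar M=2 tail: reload vs0/vs1 and vs8-vs11
+   while the xsmaddadp chain below consumes vs4/vs5 and vs16-vs19 */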
lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, 
vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 + + + addi BO, BO, 64 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + 
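+	/* second B value (vs9) against the same four A vectors fills vs36-vs39 */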
xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + 
lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + 
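+	/* T1 walks the two rows of the 4-wide C tile, advancing by LDC after
+	   each store; CO itself moves on by 16 bytes at the end of the macro */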
+#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + 
+ lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o16, T1 + + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, 
T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs16, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + 
+.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + mr T1, BO + + lxvw4x vs8, o0, T1 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, 
BO, 16 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/srot.c b/kernel/power/srot.c new file mode 100644 index 000000000..d464846a4 --- /dev/null +++ b/kernel/power/srot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
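Throughout the SAVE*x* macros above, the TRMMKERNEL conditional selects between overwriting C with alpha times the accumulators (xvmulsp / xsmuldp, no load of C) and adding alpha times the accumulators to the existing C (load C, then xvmaddasp / xsmaddadp). A minimal C sketch of that per-element choice, purely for illustration (the names are hypothetical, not part of the patch):

    /* Per-element meaning of the two SAVE paths selected by TRMMKERNEL. */
    static inline float save_elem(float c_old, float acc, float alpha, int trmmkernel)
    {
        if (trmmkernel)
            return alpha * acc;          /* TRMM path: C is overwritten        */
        return c_old + alpha * acc;      /* GEMM-style path: accumulate into C */
    }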
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/26 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "srot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + srot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c new file mode 100644 index 000000000..ade65500f --- /dev/null +++ b/kernel/power/srot_microk_power8.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
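For orientation, the operation that the new kernel/power/srot.c implements is the standard BLAS plane rotation: each pair (x[i], y[i]) becomes (c*x[i] + s*y[i], c*y[i] - s*x[i]). A minimal scalar sketch of that update with hypothetical names (the file's own unit-stride fallback follows the same recurrence, just unrolled by 16):

    #include <stddef.h>

    /* Reference plane rotation for contiguous x and y:
     * x <- c*x + s*y,  y <- c*y - s*x, element by element. */
    static void srot_ref(size_t n, float *x, float *y, float c, float s)
    {
        for (size_t i = 0; i < n; i++) {
            float t = c * x[i] + s * y[i];
            y[i]    = c * y[i] - s * x[i];
            x[i]    = t;
        }
    }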
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxvw4x 36 , 0, %3 \n\t" // load c + "lxvw4x 37 , 0, %4 \n\t" // load s + "addi %8 , %8, -4 \n\t" + "addi %9 , %9, -4 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c new file mode 100644 index 000000000..c6ef5e969 --- /dev/null +++ b/kernel/power/sscal.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
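The header note in srot_microk_power8.c about not using fused multiply-add matches the instruction selection above: the kernel forms c*x, s*y, c*y and s*x with xvmulsp and then combines them with xvaddsp/xvsubsp, so every product is rounded before the add or subtract. A hedged C illustration of the numerical difference (assuming the compiler does not contract the first form into an fma, e.g. with -ffp-contract=off):

    #include <math.h>

    /* Both products rounded to float, then one more rounding on the add;
     * this mirrors the xvmulsp + xvaddsp sequence in the kernel above.   */
    static inline float rot_x_separate(float c, float s, float x, float y)
    {
        return (c * x) + (s * y);
    }

    /* Fused form shown only for contrast: c*x stays unrounded until the
     * final add, which can change the last bit relative to the reference
     * results the LAPACK tests compare against.                          */
    static inline float rot_x_fused(float c, float s, float x, float y)
    {
        return fmaf(c, x, s * y);
    }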
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sscal_microk_power8.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c new file mode 100644 index 000000000..963cec777 --- /dev/null +++ b/kernel/power/sscal_microk_power8.c @@ -0,0 +1,218 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
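The driver logic in the new kernel/power/sscal.c splits the contiguous case into a vector kernel on the largest multiple-of-32 prefix plus a scalar tail, with a separate zero-fill path when the scalar is 0. A simplified sketch of that control flow (hypothetical names; the real file splats the scalar into a 4-element aligned array before calling sscal_kernel_16 or sscal_kernel_16_zero):

    /* Outline of the SSCAL driver: vectorizable prefix of n & -32 elements,
     * scalar tail, and a dedicated path for da == 0.                        */
    static void sscal_outline(long n, float da, float *x, long inc_x)
    {
        if (inc_x == 1) {
            long n1 = n & -32;                 /* prefix handled by the vector kernel */
            long j  = n1;                      /* scalar tail starts here             */
            if (da == 0.0f) {
                for (long k = 0; k < n1; k++) x[k] = 0.0f;  /* stands in for sscal_kernel_16_zero */
                for (; j < n; j++) x[j] = 0.0f;
            } else {
                for (long k = 0; k < n1; k++) x[k] *= da;   /* stands in for sscal_kernel_16      */
                for (; j < n; j++) x[j] *= da;
            }
        } else {
            long i = 0;
            for (long j = 0; j < n; j++, i += inc_x)
                x[i] = (da == 0.0f) ? 0.0f : da * x[i];
        }
    }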
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvw4x 32, 0, %3 \n\t" + "addi %1, %1, -4 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -4 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 32, 0, %1 \n\t" + "stxvw4x 32, %5, %1 \n\t" + "stxvw4x 32, %6, %1 \n\t" + "stxvw4x 32, %7, %1 \n\t" + "stxvw4x 32, %8, %1 \n\t" + "stxvw4x 32, %9, %1 \n\t" + "stxvw4x 32, %10, %1 \n\t" + "stxvw4x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c new file mode 100644 index 000000000..932652b37 --- /dev/null +++ b/kernel/power/sswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + sswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c new file mode 100644 index 000000000..c48e743de --- /dev/null +++ b/kernel/power/sswap_microk_power8.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
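The new kernel/power/sswap.c follows the same shape as sscal.c: when both increments are 1 it hands the first n & -32 elements to sswap_kernel_32 and finishes with a scalar loop, otherwise it swaps element by element with the given strides. A minimal sketch of the element-wise behaviour it implements (illustrative only):

    /* Strided element-wise swap of two single-precision vectors. */
    static void sswap_ref(long n, float *x, long inc_x, float *y, long inc_y)
    {
        long ix = 0, iy = 0;
        for (long i = 0; i < n; i++) {
            float t = y[iy];
            y[iy]   = x[ix];
            x[ix]   = t;
            ix += inc_x;
            iy += inc_y;
        }
    }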
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + "lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + "stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S new file mode 100644 index 000000000..f756d5d92 --- /dev/null +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -0,0 +1,369 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 340 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define TBUFFER r13 +#define o12 r14 +#define o4 r15 +#define K1 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define KKK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "strmm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + 
STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + mr KK, OFFSET +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, KK +#endif + + + cmpwi cr0, M, 0 + ble L999_H1 + cmpwi cr0, N, 0 + ble L999_H1 + cmpwi cr0, K, 0 + ble L999_H1 + + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + addi TBUFFER, SP, 320 + + addi T1, SP, 300 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 + + + +#include "strmm_logic_16x8_power8.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S new file mode 100644 index 000000000..fb2d3f94b --- /dev/null +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -0,0 +1,2968 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
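The logic file that follows drives the 16x8 macros as a register-blocked loop nest: columns are processed eight at a time (STRMM_L8_BEGIN), rows sixteen at a time (STRMM_L8x16_BEGIN), the K loop is unrolled by eight, and the KK/OFFSET arithmetic skips the part of the packed A and B panels that the triangular operand masks out. A rough C sketch of a single 16x8 tile under those conventions (illustrative only; it ignores the TRMM offset handling and the smaller M/N remainder cases):

    /* One 16x8 tile: a[] is the packed A panel (16 values per K step),
     * b[] is the packed B panel (8 values per K step); the TRMM save path
     * overwrites C with alpha times the accumulators.                      */
    static void strmm_tile_16x8(long k, float alpha,
                                const float *a, const float *b,
                                float *c, long ldc)
    {
        float acc[16][8] = {{0.0f}};
        for (long l = 0; l < k; l++)                    /* K loop, unrolled by 8 in the assembly */
            for (long jj = 0; jj < 8; jj++)
                for (long ii = 0; ii < 16; ii++)
                    acc[ii][jj] += a[l * 16 + ii] * b[l * 8 + jj];
        for (long jj = 0; jj < 8; jj++)                 /* SAVE8x16, TRMMKERNEL path */
            for (long ii = 0; ii < 16; ii++)
                c[ii + jj * ldc] = alpha * acc[ii][jj];
    }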
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + srawi. J, N, 3 + ble STRMM_L8_END + +STRMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L8x16_END + +STRMM_L8x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x16_SUB4 + +STRMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + dcbt AO, PRE + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble STRMM_L8x16_LOOP_END + + .align 5 + +STRMM_L8x16_LOOP: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -1 + bgt STRMM_L8x16_LOOP + +STRMM_L8x16_LOOP_END: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + KERNEL8x16_E2 + + b STRMM_L8x16_SUB1 + +STRMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b STRMM_L8x16_SUB1 + +STRMM_L8x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x16_SAVE + b STRMM_L8x16_SUB2 + +STRMM_L8x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x16_SAVE + +STRMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L8x16_SUB2 + +STRMM_L8x16_SAVE: + + SAVE8x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L8x16_BEGIN + +STRMM_L8x16_END: + +STRMM_L8x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L8x1_END + + andi. T1, M, 8 + ble STRMM_L8x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x8_SUB4 + +STRMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble STRMM_L8x8_LOOP_END + + .align 5 + +STRMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt STRMM_L8x8_LOOP + +STRMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b STRMM_L8x8_SUB1 + +STRMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b STRMM_L8x8_SUB1 + +STRMM_L8x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x8_SAVE + b STRMM_L8x8_SUB2 + +STRMM_L8x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x8_SAVE + +STRMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x8_SUB2 + +STRMM_L8x8_SAVE: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L8x8_END: + +STRMM_L8x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L8x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x4_SUB4 + +STRMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble STRMM_L8x4_LOOP_END + + .align 5 + +STRMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt STRMM_L8x4_LOOP + +STRMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b STRMM_L8x4_SUB1 + +STRMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b STRMM_L8x4_SUB1 + +STRMM_L8x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x4_SAVE + b STRMM_L8x4_SUB2 + +STRMM_L8x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x4_SAVE + +STRMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x4_SUB2 + +STRMM_L8x4_SAVE: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L8x4_END: + +STRMM_L8x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L8x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x2_SUB4 + +STRMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble STRMM_L8x2_LOOP_END + + .align 5 + +STRMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. 
L, L, -1 + bgt STRMM_L8x2_LOOP + +STRMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b STRMM_L8x2_SUB1 + +STRMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b STRMM_L8x2_SUB1 + +STRMM_L8x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x2_SAVE + b STRMM_L8x2_SUB2 + +STRMM_L8x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x2_SAVE + +STRMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L8x2_SUB2 + +STRMM_L8x2_SAVE: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L8x2_END: + +STRMM_L8x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L8x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L8x1_SUB4 + +STRMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble STRMM_L8x1_LOOP_END + + .align 5 + +STRMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt STRMM_L8x1_LOOP + +STRMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b STRMM_L8x1_SUB1 + +STRMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b STRMM_L8x1_SUB1 + +STRMM_L8x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L8x1_SAVE + b STRMM_L8x1_SUB2 + +STRMM_L8x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L8x1_SAVE + +STRMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L8x1_SUB2 + +STRMM_L8x1_SAVE: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 8 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt STRMM_L8_BEGIN + + andi. T2, N, 7 + ble L999 + +STRMM_L8_END: + + b STRMM_L4_BEGIN + +L999_H1: + + b L999 + +STRMM_L4_BEGIN: + + andi. T1, N, 4 + ble STRMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L4x16_END + +STRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x16_SUB4 + +STRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble STRMM_L4x16_LOOP_END + + .align 5 + +STRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt STRMM_L4x16_LOOP + +STRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b STRMM_L4x16_SUB1 + +STRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b STRMM_L4x16_SUB1 + +STRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x16_SAVE + b STRMM_L4x16_SUB2 + +STRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x16_SAVE + +STRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L4x16_SUB2 + +STRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L4x16_BEGIN + +STRMM_L4x16_END: + +STRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L4x1_END + + andi. T1, M, 8 + ble STRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x8_SUB4 + +STRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble STRMM_L4x8_LOOP_END + + .align 5 + +STRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt STRMM_L4x8_LOOP + +STRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b STRMM_L4x8_SUB1 + +STRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b STRMM_L4x8_SUB1 + +STRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x8_SAVE + b STRMM_L4x8_SUB2 + +STRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x8_SAVE + +STRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x8_SUB2 + +STRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L4x8_END: + +STRMM_L4x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x4_SUB4 + +STRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble STRMM_L4x4_LOOP_END + + .align 5 + +STRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt STRMM_L4x4_LOOP + +STRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b STRMM_L4x4_SUB1 + +STRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b STRMM_L4x4_SUB1 + +STRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x4_SAVE + b STRMM_L4x4_SUB2 + +STRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x4_SAVE + +STRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x4_SUB2 + +STRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L4x4_END: + +STRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x2_SUB4 + +STRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble STRMM_L4x2_LOOP_END + + .align 5 + +STRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. 
L, L, -1 + bgt STRMM_L4x2_LOOP + +STRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b STRMM_L4x2_SUB1 + +STRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b STRMM_L4x2_SUB1 + +STRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x2_SAVE + b STRMM_L4x2_SUB2 + +STRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x2_SAVE + +STRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L4x2_SUB2 + +STRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L4x2_END: + +STRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L4x1_SUB4 + +STRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble STRMM_L4x1_LOOP_END + + .align 5 + +STRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt STRMM_L4x1_LOOP + +STRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b STRMM_L4x1_SUB1 + +STRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b STRMM_L4x1_SUB1 + +STRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L4x1_SAVE + b STRMM_L4x1_SUB2 + +STRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L4x1_SAVE + +STRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L4x1_SUB2 + +STRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + +STRMM_L4_END: +STRMM_L2_BEGIN: + + andi. T1, N, 2 + ble STRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L2x16_END + +STRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x16_SUB4 + +STRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble STRMM_L2x16_LOOP_END + + .align 5 + +STRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt STRMM_L2x16_LOOP + +STRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b STRMM_L2x16_SUB1 + +STRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b STRMM_L2x16_SUB1 + +STRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x16_SAVE + b STRMM_L2x16_SUB2 + +STRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x16_SAVE + +STRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L2x16_SUB2 + +STRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L2x16_BEGIN + +STRMM_L2x16_END: + +STRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L2x1_END + + andi. T1, M, 8 + ble STRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x8_SUB4 + +STRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble STRMM_L2x8_LOOP_END + + .align 5 + +STRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt STRMM_L2x8_LOOP + +STRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b STRMM_L2x8_SUB1 + +STRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b STRMM_L2x8_SUB1 + +STRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x8_SAVE + b STRMM_L2x8_SUB2 + +STRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x8_SAVE + +STRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x8_SUB2 + +STRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L2x8_END: + +STRMM_L2x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x4_SUB4 + +STRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble STRMM_L2x4_LOOP_END + + .align 5 + +STRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt STRMM_L2x4_LOOP + +STRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b STRMM_L2x4_SUB1 + +STRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b STRMM_L2x4_SUB1 + +STRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x4_SAVE + b STRMM_L2x4_SUB2 + +STRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x4_SAVE + +STRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x4_SUB2 + +STRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L2x4_END: + +STRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x2_SUB4 + +STRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble STRMM_L2x2_LOOP_END + + .align 5 + +STRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt STRMM_L2x2_LOOP + +STRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b STRMM_L2x2_SUB1 + +STRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b STRMM_L2x2_SUB1 + +STRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x2_SAVE + b STRMM_L2x2_SUB2 + +STRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x2_SAVE + +STRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L2x2_SUB2 + +STRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L2x2_END: + +STRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L2x1_SUB4 + +STRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble STRMM_L2x1_LOOP_END + + .align 5 + +STRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt STRMM_L2x1_LOOP + +STRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b STRMM_L2x1_SUB1 + +STRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b STRMM_L2x1_SUB1 + +STRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L2x1_SAVE + b STRMM_L2x1_SUB2 + +STRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L2x1_SAVE + +STRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L2x1_SUB2 + +STRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +STRMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +STRMM_L2_END: +STRMM_L1_BEGIN: + + andi. T1, N, 1 + ble STRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble STRMM_L1x16_END + +STRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x16_SUB4 + +STRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble STRMM_L1x16_LOOP_END + + .align 5 + +STRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt STRMM_L1x16_LOOP + +STRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b STRMM_L1x16_SUB1 + +STRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b STRMM_L1x16_SUB1 + +STRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x16_SAVE + b STRMM_L1x16_SUB2 + +STRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x16_SAVE + +STRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt STRMM_L1x16_SUB2 + +STRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt STRMM_L1x16_BEGIN + +STRMM_L1x16_END: + +STRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble STRMM_L1x1_END + + andi. T1, M, 8 + ble STRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x8_SUB4 + +STRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble STRMM_L1x8_LOOP_END + + .align 5 + +STRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt STRMM_L1x8_LOOP + +STRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b STRMM_L1x8_SUB1 + +STRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b STRMM_L1x8_SUB1 + +STRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x8_SAVE + b STRMM_L1x8_SUB2 + +STRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x8_SAVE + +STRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x8_SUB2 + +STRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +STRMM_L1x8_END: + +STRMM_L1x4_BEGIN: + + andi. 
T1, M, 4 + ble STRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x4_SUB4 + +STRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble STRMM_L1x4_LOOP_END + + .align 5 + +STRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt STRMM_L1x4_LOOP + +STRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b STRMM_L1x4_SUB1 + +STRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b STRMM_L1x4_SUB1 + +STRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x4_SAVE + b STRMM_L1x4_SUB2 + +STRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x4_SAVE + +STRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x4_SUB2 + +STRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +STRMM_L1x4_END: + +STRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble STRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x2_SUB4 + +STRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble STRMM_L1x2_LOOP_END + + .align 5 + +STRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt STRMM_L1x2_LOOP + +STRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b STRMM_L1x2_SUB1 + +STRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b STRMM_L1x2_SUB1 + +STRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x2_SAVE + b STRMM_L1x2_SUB2 + +STRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x2_SAVE + +STRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt STRMM_L1x2_SUB2 + +STRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +STRMM_L1x2_END: + +STRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble STRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble STRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble STRMM_L1x1_SUB4 + +STRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble STRMM_L1x1_LOOP_END + + .align 5 + +STRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt STRMM_L1x1_LOOP + +STRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b STRMM_L1x1_SUB1 + +STRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b STRMM_L1x1_SUB1 + +STRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble STRMM_L1x1_SAVE + b STRMM_L1x1_SUB2 + +STRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble STRMM_L1x1_SAVE + +STRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1
+	bgt	STRMM_L1x1_SUB2
+
+STRMM_L1x1_SAVE:
+
+	SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+	sub	T1, K, KKK	// K - KKK -> TEMP1
+	slwi	T2, T1, 2	// TEMP1 * Number of values in B shifted -> TEMP2
+	slwi	T1, T1, 2	// TEMP1 * Number of values in A shifted -> TEMP1
+	add	BO, BO, T2	// BO += TEMP2 * number of values in B shifted
+	add	AO, AO, T1	// AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in A
+#endif
+
+
+STRMM_L1x1_END:
+
+#if !defined(LEFT)
+	addi	KK, KK, 1	// KK += Number of values in B
+#endif
+
+
+STRMM_L1_END:
diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S
new file mode 100644
index 000000000..27bc1e89c
--- /dev/null
+++ b/kernel/power/strmm_macros_16x8_power8.S
@@ -0,0 +1,5840 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, 
vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + 
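As a rough scalar sketch of what the KERNEL8x16 macros above accumulate (illustrative C only, not part of this patch or of the OpenBLAS sources): vs0..vs3 hold 16 floats of A, vs8..vs15 hold 8 B values splatted by xxspltw, and vs32..vs63 hold a 16x8 tile of partial sums that each k-step updates with xvmaddasp; the SAVE8x16 macro that follows then writes alpha times that tile to C (adding the existing C values when TRMMKERNEL is not defined).

/*
 * Illustrative scalar model of one k-step of the 16x8 micro-kernel above.
 * The function name and data layout are chosen for the example only.
 */
#include <stdio.h>

static void kernel8x16_step(float acc[8][16], const float a[16], const float b[8])
{
    for (int j = 0; j < 8; j++)          /* one splatted B value per column of the tile */
        for (int i = 0; i < 16; i++)     /* 16 A values = four 4-float VSX vectors      */
            acc[j][i] += a[i] * b[j];    /* scalar view of the xvmaddasp accumulation   */
}

int main(void)
{
    float acc[8][16] = {{0}};
    float a[16], b[8];
    for (int i = 0; i < 16; i++) a[i] = (float)(i + 1);
    for (int j = 0; j < 8; j++)  b[j] = 0.5f * (float)(j + 1);

    kernel8x16_step(acc, a, b);          /* one k iteration of the inner loop */
    printf("acc[0][0]=%g acc[7][15]=%g\n", acc[0][0], acc[7][15]);
    return 0;
}

The assembly keeps the whole tile in vector registers and interleaves the loads for the next k iteration with the multiply-adds of the current one, which is why the _1/_2 macro pairs ping-pong between the vs0..vs15 and vs4..vs23 register sets.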
+.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + 
+#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + 
xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + 
xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + 
xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp 
vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef 
TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + 
xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + 
xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + 
xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 
3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 
+ xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, 
vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* 
Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 
+ xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + 
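
Note on the macros above: each LOADnxm_1 / KERNELnxm_* / SAVEnxm group implements one n-by-m micro-tile of the single-precision GEMM/TRMM update. The packed A panel is streamed through vs0-vs7, the B values are broadcast into vs8-vs23 (xxspltw in the vector cases, lxsspx in the scalar ones), partial products accumulate in vs32-vs63, and SAVE scales by alpha (alpha_vr / alpha_r), optionally adds the existing C tile, and walks the columns of C with LDC before advancing CO to the next tile. The C function below is only a reference sketch of that arithmetic, not part of the patch; the function name, the trmm flag and the packed-panel indexing are inferred from the pointer increments (AO advances 4*m bytes and BO 4*n bytes per k step) and should be read as assumptions.

/* Illustrative reference model of one n x m micro-tile update, matching what
 * a KERNEL*_SUB1 + SAVE* macro pair computes.  A is an m x k packed panel,
 * B is a k x n packed panel, C is column-major with leading dimension ldc.
 * With trmm non-zero the tile is overwritten (C = alpha*A*B, the TRMMKERNEL
 * path, xvmulsp); otherwise it accumulates (C += alpha*A*B, xvmaddasp). */
void microtile_ref(int m, int n, int k, float alpha,
                   const float *A, const float *B,
                   float *C, int ldc, int trmm)
{
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            float acc = 0.0f;
            for (int p = 0; p < k; p++)
                acc += A[p * m + i] * B[p * n + j];  /* FMA chain over k */
            if (trmm)
                C[j * ldc + i] = alpha * acc;        /* xvmulsp path     */
            else
                C[j * ldc + i] += alpha * acc;       /* xvmaddasp path   */
        }
    }
}
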
+/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, 
vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx 
vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + 
lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add 
T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + 
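Reviewer note on the SAVE macros above: every SAVEmxn variant follows the same pattern, which may be easier to read as scalar C than as VSX assembly. The sketch below is illustrative only; the function name save_tile and the column-major acc layout are assumptions made for this note, not part of the kernel. It mirrors what the assembly does: without TRMMKERNEL the existing C tile is loaded and updated as C = alpha*acc + C (xvmaddasp / xsmaddadp), with TRMMKERNEL defined it is overwritten as C = alpha*acc (xvmulsp / xsmuldp), and CO then advances by the width of the tile.

    #include <stddef.h>

    /* Scalar sketch of the SAVEmxn macros (illustration only).
     * "acc" is the accumulated A*B tile, assumed column-major here;
     * "alpha" is the value held in alpha_r / alpha_vr;
     * "ldc" is given in elements in this sketch (the assembly uses bytes);
     * "trmm" selects the TRMMKERNEL variant, which overwrites C
     * instead of accumulating into it. */
    static void save_tile(float *c, size_t ldc, const float *acc, float alpha,
                          int rows, int cols, int trmm)
    {
        for (int j = 0; j < cols; j++) {      /* one column of C per LDC step */
            for (int i = 0; i < rows; i++) {
                float v = alpha * acc[j * rows + i];
                c[j * ldc + i] = trmm ? v : c[j * ldc + i] + v;
            }
        }
    }
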
+/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c new file mode 100644 index 000000000..abd6ec08a --- /dev/null +++ b/kernel/power/zasum.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "zasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[2] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -8; + if ( n1 > 0 ) + { + + zasum_kernel_8(n1, x, svec); + sumf = svec[0] + svec[1]; + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c new file mode 100644 index 000000000..b9f6c0ac6 --- /dev/null +++ b/kernel/power/zasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. 
%0 , %0 , -8 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c new file mode 100644 index 000000000..0ee0c1bf9 --- /dev/null +++ b/kernel/power/zaxpy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zaxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[4]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + da[0] = da_r; + da[1] = da_r; + da[2] = da_i; + da[3] = da_i; + zaxpy_kernel_4(n1, x, y , da ); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c new file mode 100644 index 000000000..c8a529fd9 --- /dev/null +++ b/kernel/power/zaxpy_microk_power8.c @@ -0,0 +1,250 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif + + + __asm__ __volatile__ + ( + + "lxsdx 34, 0 , %4 \n\t" // alpha_r + "lxsdx 35, %5, %4 \n\t" // alpha_i + "xxspltd 32, 34, 0 \n\t" + "xxspltd 33, 35, 0 \n\t" + + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec +#endif + + "addi %8, %8, -8 \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "lxvd2x 50, %6, %3 \n\t" // y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "xxswapd 61, 45 \n\t" // exchange real and imag part + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "xvmaddadp 52, 44, 32 \n\t" + "addi %2, %2, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 64 \n\t" + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + "lxvd2x 50, %6, %3 \n\t" // y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "addi %3, %3, 64 \n\t" + "xxswapd 61, 45 \n\t" // exchange real and imag part + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (mvec), // 9 + "r" (pre) // 10 + : "cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c new file mode 100644 index 000000000..a7658f7ab --- /dev/null +++ b/kernel/power/zcopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "zcopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_16 + +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_16(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c new file mode 100644 index 000000000..73abe084e --- /dev/null +++ b/kernel/power/zcopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c new file mode 100644 index 000000000..1205b34b6 --- /dev/null +++ b/kernel/power/zdot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + + +#if defined(POWER8) +#include "zdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + BLASLONG j = i * 2; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c new file mode 100644 index 000000000..296d3d469 --- /dev/null +++ b/kernel/power/zdot_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 33, 33, 35 \n\t" + "xvadddp 37, 37, 39 \n\t" + + "xvadddp 32, 32, 36 \n\t" + "xvadddp 33, 33, 37 \n\t" + + "stxvd2x 32, 0, %4 \n\t" + "stxvd2x 33, %5, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (pre) // 8 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index a7665f749..336b13b1f 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_r vs30 #define alpha_i vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE - addi SP, SP, -STACKSIZE - li r0, 0 + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) @@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble .L999 + ble L999 cmpwi cr0, N, 0 - ble .L999 + ble L999 cmpwi cr0, K, 0 - ble .L999 + ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif - lxvdsx alpha_r, 0, ALPHA - lxvdsx alpha_i, o8, ALPHA + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA - .align 5 + .align 4 #include "zgemm_logic_8x2_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) @@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 5fcade5bf..96612da82 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,83 +1,111 @@ srawi. J, N, 1 - ble .LZGEMM_L2_END + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +ZGEMM_L2_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L2_COPYB -.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LZGEMM_L2x8_END + ble ZGEMM_L2x8_END -.LZGEMM_L2x8_BEGIN: +ZGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x8_SUB0 + ble ZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x8_SUB4 + ble ZGEMM_L2x8_SUB4 -.LZGEMM_L2x8_LOOP_START: +ZGEMM_L2x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -2 - ble .LZGEMM_L2x8_LOOP_END + ble ZGEMM_L2x8_LOOP_END .align 5 -.LZGEMM_L2x8_LOOP: +ZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LZGEMM_L2x8_LOOP + bgt ZGEMM_L2x8_LOOP -.LZGEMM_L2x8_LOOP_END: +ZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE @@ -88,9 +116,9 @@ KERNEL2x8_1 KERNEL2x8_E2 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB4: +ZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +134,53 @@ KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB0: +ZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x8_SAVE - b .LZGEMM_L2x8_SUB2 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SUB1: +ZGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x8_SAVE + ble ZGEMM_L2x8_SAVE -.LZGEMM_L2x8_SUB2: +ZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. 
L, L, -1 - bgt .LZGEMM_L2x8_SUB2 + bgt ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SAVE: +ZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LZGEMM_L2x8_BEGIN + bgt ZGEMM_L2x8_BEGIN -.LZGEMM_L2x8_END: +ZGEMM_L2x8_END: -.LZGEMM_L2x4_BEGIN: +ZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L2x1_END + ble ZGEMM_L2x1_END andi. T1, M, 4 - ble .LZGEMM_L2x4_END - mr BO, B + ble ZGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x4_SUB0 + ble ZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x4_SUB4 + ble ZGEMM_L2x4_SUB4 -.LZGEMM_L2x4_LOOP_START: +ZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +194,11 @@ KERNEL2x4_2 addic. L, L, -2 - ble .LZGEMM_L2x4_LOOP_END + ble ZGEMM_L2x4_LOOP_END .align 5 -.LZGEMM_L2x4_LOOP: +ZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +211,9 @@ KERNEL2x4_2 addic. L, L, -1 - bgt .LZGEMM_L2x4_LOOP + bgt ZGEMM_L2x4_LOOP -.LZGEMM_L2x4_LOOP_END: +ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +225,9 @@ KERNEL2x4_1 KERNEL2x4_E2 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB4: +ZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +239,48 @@ KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB0: +ZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x4_SAVE - b .LZGEMM_L2x4_SUB2 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SUB1: +ZGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x4_SAVE + ble ZGEMM_L2x4_SAVE -.LZGEMM_L2x4_SUB2: +ZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x4_SUB2 + bgt ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SAVE: +ZGEMM_L2x4_SAVE: SAVE2x4 -.LZGEMM_L2x4_END: +ZGEMM_L2x4_END: -.LZGEMM_L2x2_BEGIN: +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L2x2_END - mr BO, B + ble ZGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x2_SUB0 + ble ZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x2_SUB4 + ble ZGEMM_L2x2_SUB4 -.LZGEMM_L2x2_LOOP_START: +ZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +294,11 @@ KERNEL2x2_2 addic. L, L, -2 - ble .LZGEMM_L2x2_LOOP_END + ble ZGEMM_L2x2_LOOP_END .align 5 -.LZGEMM_L2x2_LOOP: +ZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +311,9 @@ KERNEL2x2_2 addic. L, L, -1 - bgt .LZGEMM_L2x2_LOOP + bgt ZGEMM_L2x2_LOOP -.LZGEMM_L2x2_LOOP_END: +ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +325,9 @@ KERNEL2x2_1 KERNEL2x2_E2 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB4: +ZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +339,48 @@ KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB0: +ZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x2_SAVE - b .LZGEMM_L2x2_SUB2 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SUB1: +ZGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x2_SAVE + ble ZGEMM_L2x2_SAVE -.LZGEMM_L2x2_SUB2: +ZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x2_SUB2 + bgt ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SAVE: +ZGEMM_L2x2_SAVE: SAVE2x2 -.LZGEMM_L2x2_END: +ZGEMM_L2x2_END: -.LZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L2x1_END - mr BO, B + ble ZGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x1_SUB0 + ble ZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x1_SUB4 + ble ZGEMM_L2x1_SUB4 -.LZGEMM_L2x1_LOOP_START: +ZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +394,11 @@ KERNEL2x1_2 addic. L, L, -2 - ble .LZGEMM_L2x1_LOOP_END + ble ZGEMM_L2x1_LOOP_END .align 5 -.LZGEMM_L2x1_LOOP: +ZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +411,9 @@ KERNEL2x1_2 addic. 
L, L, -1 - bgt .LZGEMM_L2x1_LOOP + bgt ZGEMM_L2x1_LOOP -.LZGEMM_L2x1_LOOP_END: +ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +425,9 @@ KERNEL2x1_1 KERNEL2x1_E2 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB4: +ZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +439,89 @@ KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB0: +ZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x1_SAVE - b .LZGEMM_L2x1_SUB2 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SUB1: +ZGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x1_SAVE + ble ZGEMM_L2x1_SAVE -.LZGEMM_L2x1_SUB2: +ZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x1_SUB2 + bgt ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SAVE: +ZGEMM_L2x1_SAVE: SAVE2x1 -.LZGEMM_L2x1_END: +ZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LZGEMM_L2_BEGIN + bgt ZGEMM_L2_BEGIN andi. T2, N, 1 - ble .L999 + ble L999 -.LZGEMM_L2_END: +ZGEMM_L2_END: - b .LZGEMM_L1_BEGIN + b ZGEMM_L1_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 + +ZGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +ZGEMM_L1_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L1_COPYB -.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LZGEMM_L1_END + ble ZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LZGEMM_L1x8_END + ble ZGEMM_L1x8_END -.LZGEMM_L1x8_BEGIN: +ZGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x8_SUB0 + ble ZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x8_SUB4 + ble ZGEMM_L1x8_SUB4 -.LZGEMM_L1x8_LOOP_START: +ZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +544,11 @@ KERNEL1x8_2 addic. L, L, -2 - ble .LZGEMM_L1x8_LOOP_END + ble ZGEMM_L1x8_LOOP_END .align 5 -.LZGEMM_L1x8_LOOP: +ZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +569,9 @@ KERNEL1x8_2 addic. L, L, -1 - bgt .LZGEMM_L1x8_LOOP + bgt ZGEMM_L1x8_LOOP -.LZGEMM_L1x8_LOOP_END: +ZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +590,9 @@ KERNEL1x8_1 KERNEL1x8_E2 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB4: +ZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +608,53 @@ KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB0: +ZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x8_SAVE - b .LZGEMM_L1x8_SUB2 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SUB1: +ZGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x8_SAVE + ble ZGEMM_L1x8_SAVE -.LZGEMM_L1x8_SUB2: +ZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x8_SUB2 + bgt ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SAVE: +ZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LZGEMM_L1x8_BEGIN + bgt ZGEMM_L1x8_BEGIN -.LZGEMM_L1x8_END: +ZGEMM_L1x8_END: -.LZGEMM_L1x4_BEGIN: +ZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L1x1_END + ble ZGEMM_L1x1_END andi. T1, M, 4 - ble .LZGEMM_L1x4_END - mr BO, B + ble ZGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x4_SUB0 + ble ZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x4_SUB4 + ble ZGEMM_L1x4_SUB4 -.LZGEMM_L1x4_LOOP_START: +ZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +668,11 @@ KERNEL1x4_2 addic. L, L, -2 - ble .LZGEMM_L1x4_LOOP_END + ble ZGEMM_L1x4_LOOP_END .align 5 -.LZGEMM_L1x4_LOOP: +ZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +685,9 @@ KERNEL1x4_2 addic. 
L, L, -1 - bgt .LZGEMM_L1x4_LOOP + bgt ZGEMM_L1x4_LOOP -.LZGEMM_L1x4_LOOP_END: +ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +699,9 @@ KERNEL1x4_1 KERNEL1x4_E2 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB4: +ZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +713,48 @@ KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB0: +ZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x4_SAVE - b .LZGEMM_L1x4_SUB2 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SUB1: +ZGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x4_SAVE + ble ZGEMM_L1x4_SAVE -.LZGEMM_L1x4_SUB2: +ZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x4_SUB2 + bgt ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SAVE: +ZGEMM_L1x4_SAVE: SAVE1x4 -.LZGEMM_L1x4_END: +ZGEMM_L1x4_END: -.LZGEMM_L1x2_BEGIN: +ZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L1x2_END - mr BO, B + ble ZGEMM_L1x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x2_SUB0 + ble ZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x2_SUB4 + ble ZGEMM_L1x2_SUB4 -.LZGEMM_L1x2_LOOP_START: +ZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +768,11 @@ KERNEL1x2_2 addic. L, L, -2 - ble .LZGEMM_L1x2_LOOP_END + ble ZGEMM_L1x2_LOOP_END .align 5 -.LZGEMM_L1x2_LOOP: +ZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +785,9 @@ KERNEL1x2_2 addic. L, L, -1 - bgt .LZGEMM_L1x2_LOOP + bgt ZGEMM_L1x2_LOOP -.LZGEMM_L1x2_LOOP_END: +ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +799,9 @@ KERNEL1x2_1 KERNEL1x2_E2 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB4: +ZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +813,48 @@ KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB0: +ZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x2_SAVE - b .LZGEMM_L1x2_SUB2 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SUB1: +ZGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x2_SAVE + ble ZGEMM_L1x2_SAVE -.LZGEMM_L1x2_SUB2: +ZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x2_SUB2 + bgt ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SAVE: +ZGEMM_L1x2_SAVE: SAVE1x2 -.LZGEMM_L1x2_END: +ZGEMM_L1x2_END: -.LZGEMM_L1x1_BEGIN: +ZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L1x1_END - mr BO, B + ble ZGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x1_SUB0 + ble ZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x1_SUB4 + ble ZGEMM_L1x1_SUB4 -.LZGEMM_L1x1_LOOP_START: +ZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +868,11 @@ KERNEL1x1_2 addic. L, L, -2 - ble .LZGEMM_L1x1_LOOP_END + ble ZGEMM_L1x1_LOOP_END .align 5 -.LZGEMM_L1x1_LOOP: +ZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +885,9 @@ KERNEL1x1_2 addic. L, L, -1 - bgt .LZGEMM_L1x1_LOOP + bgt ZGEMM_L1x1_LOOP -.LZGEMM_L1x1_LOOP_END: +ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +899,9 @@ KERNEL1x1_1 KERNEL1x1_E2 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB4: +ZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +913,34 @@ KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB0: +ZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x1_SAVE - b .LZGEMM_L1x1_SUB2 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SUB1: +ZGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x1_SAVE + ble ZGEMM_L1x1_SAVE -.LZGEMM_L1x1_SUB2: +ZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 - bgt .LZGEMM_L1x1_SUB2 + bgt ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SAVE: +ZGEMM_L1x1_SAVE: SAVE1x1 -.LZGEMM_L1x1_END: +ZGEMM_L1x1_END: -.LZGEMM_L1_END: +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 701ec65c8..a0fbb2e11 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,39 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x8_1 + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B - xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - lxvd2x vs8, o0, AO // load real,imag from A - lxvd2x vs9, o16, AO // load real,imag from A - xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - - lxvd2x vs10, o32, AO // load real,imag from A - lxvd2x vs11, o48, AO // load real,imag from A - xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - - addi AO, AO, 64 - xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag @@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - - lxvd2x vs12, o0, AO // load real,imag from A - lxvd2x vs13, o16, AO // load real,imag from A - xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - - lxvd2x vs14, o32, AO // load real,imag from A - lxvd2x vs15, o48, AO // load real,imag from A - xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm .macro KERNEL2x8_2 + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - lxvd2x vs0, o0, AO // load real,imag from A - lxvd2x vs1, o16, AO // load real,imag from A - xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - - lxvd2x vs2, o32, AO // load real,imag from A - lxvd2x vs3, o48, AO // load real,imag from A - xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - addi AO, AO, 64 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - - lxvd2x vs4, o0, AO // load real,imag from A - lxvd2x vs5, o16, AO // load real,imag from A - xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - - lxvd2x vs6, o32, AO // load real,imag from A - lxvd2x vs7, o48, AO // load real,imag from A - xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, 
imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B - xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm @@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A @@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD1x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A @@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c new file mode 100644 index 000000000..213839a8f --- /dev/null +++ b/kernel/power/zscal.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "zscal_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + BLASLONG i=0; + FLOAT *x1=x; + FLOAT alpha_r1=alpha[0]; + FLOAT alpha_r2=alpha[1]; + FLOAT alpha_i1=alpha[2]; + FLOAT alpha_i2=alpha[3]; + FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; + FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; + + while ( i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_r; + alpha[2] = -da_i; + alpha[3] = da_i; + zscal_kernel_8(n1, x, alpha); + i=n1; + ip = n1 * 2; + + } + + while ( i < n ) + { + + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += 2; + i++; + } + + } + else + { + + inc_x2 = 2 * inc_x; + + while ( i < n ) + { + + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += inc_x2; + i++; + } + + + } + + return(0); + +} + + diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c new file mode 100644 index 000000000..5e09d8d79 --- /dev/null +++ b/kernel/power/zscal_microk_power8.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( lapack precision problems ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r + "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c new file mode 100644 index 000000000..5ec1eee2e --- /dev/null +++ b/kernel/power/zswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zswap_kernel_16(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c new file mode 100644 index 000000000..9e5623752 --- /dev/null +++ b/kernel/power/zswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index 8b953765e..0cfe613d5 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#include "zgemm_macros_8x2_power8.S" +#include "ztrmm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble .L999 diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S new file mode 100644 index 000000000..701ec65c8 --- /dev/null +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -0,0 +1,3110 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // 
real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, 
vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // 
real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, 
vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp 
vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + 
XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // 
real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag 
part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, 
imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, 
alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, 
alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + 
+.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, 
imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge 
real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, 
vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, 
vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, 
imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + 
xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + 
stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x 
vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, 
realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO 
// load real part from B
+	lxvdsx	vs21,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
+	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+	lxvd2x	vs8,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs20,	o0,	BO		// load real part from B
+	lxvdsx	vs21,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
+	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
+	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
+	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
+	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+	lxvd2x	vs0,	o0,	AO		// load real,imag from A
+
+	addi	AO, AO, 16
+
+	lxvdsx	vs16,	o0,	BO		// load real part from B
+	lxvdsx	vs17,	o8,	BO		// load imag part from B
+
+	addi	BO, BO, 16
+
+	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
+	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+	mr	T1,	CO
+
+#ifndef TRMMKERNEL
+
+	lxvd2x	vs16,	o0,	T1
+
+#endif
+
+
+	xxlxor	vs0,	vs0,	vs0
+	xxlxor	vs1,	vs1,	vs1
+	xxswapd	vs33,	vs33		// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
+	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
+
+	xxswapd	vs32,	vs32		// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	xxswapd	vs33,	vs33		// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
+	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
+
+	xsmuldp	vs4,	vs0,	alpha_r		// real*alpha_r
+	xsmuldp	vs5,	vs1,	alpha_i		// imag*alpha_i
+	xsmuldp	vs6,	vs0,	alpha_i		// real*alpha_i
+	xsmuldp	vs7,	vs1,	alpha_r		// imag*alpha_r
+
+	xssubdp	vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
+	xsadddp	vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
+	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+	xvadddp	vs8,	vs8,	vs16
+
+#endif
+
+	stxvd2x	vs8,	o0,	T1
+
+	add	T1, T1, LDC
+	addi	CO, CO, 16
+
+.endm
+
diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER
index f14c82303..4ec748284 100644
--- a/kernel/x86_64/KERNEL.STEAMROLLER
+++ b/kernel/x86_64/KERNEL.STEAMROLLER
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
 DGEMVNKERNEL = dgemv_n_4.c
 DGEMVTKERNEL = dgemv_t_4.c
 
-ZGEMVNKERNEL = zgemv_t_4.c
+ZGEMVNKERNEL = zgemv_n_4.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
 DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c
index a6da1fea7..a3d20d276 100644
--- a/kernel/x86_64/sdot.c
+++ b/kernel/x86_64/sdot.c
@@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+	double dot = 0.0 ;
 
-	FLOAT  dot = 0.0 ;
+	FLOAT  mydot=0.0;
+	BLASLONG n1;
 
 	if ( n <= 0 ) return(dot);
 
 	if ( (inc_x == 1) && (inc_y == 1) )
 	{
 
-		BLASLONG n1 = n & -32;
+		n1 = n & (BLASLONG)(-32);
 
 		if ( n1 )
-			sdot_kernel_16(n1, x, y , &dot );
+			sdot_kernel_16(n1, x, y , &mydot );
 
 		i = n1;
 
@@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 			i++ ;
 
 		}
 
+		dot+=mydot;
 		return(dot);
 
 	}
 
-	BLASLONG n1 = n & -2;
+	n1 = n & (BLASLONG)(-2);
 
 	while(i < n1)
 	{
 
@@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 
 }
 
-
diff --git a/param.h b/param.h
index 31125d8e4..a6ead4b64 100644
--- a/param.h
+++ b/param.h
@@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(POWER8)
 
-#define SNUMOPT		4
+#define SNUMOPT		16
 #define DNUMOPT		8
 
-#define GEMM_DEFAULT_OFFSET_A  384
-#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_OFFSET_A 4096
+#define GEMM_DEFAULT_OFFSET_B 4096
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 4
-#define SGEMM_DEFAULT_UNROLL_N 4
+#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define DGEMM_DEFAULT_UNROLL_N 4
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_N 2
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_N 2
 
-#define SGEMM_DEFAULT_P 992
+#define SGEMM_DEFAULT_P 960
 #define DGEMM_DEFAULT_P 480
-#define CGEMM_DEFAULT_P 488
-#define ZGEMM_DEFAULT_P 240
+#define CGEMM_DEFAULT_P 720
+#define ZGEMM_DEFAULT_P 480
 
-#define SGEMM_DEFAULT_Q 504
+#define SGEMM_DEFAULT_Q 720
 #define DGEMM_DEFAULT_Q 720
-#define CGEMM_DEFAULT_Q 400
-#define ZGEMM_DEFAULT_Q 360
+#define CGEMM_DEFAULT_Q 720
+#define ZGEMM_DEFAULT_Q 720
 
-#define SGEMM_DEFAULT_R 28800
+#define SGEMM_DEFAULT_R 21600
 #define DGEMM_DEFAULT_R 14400
-#define ZGEMM_DEFAULT_R 7200
+#define CGEMM_DEFAULT_R 16200
+#define ZGEMM_DEFAULT_R 21600
 
 #define SYMV_P 8
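For reference, the per-element update performed by the SAVE1x*/SAVE2x* macros above can be written in scalar C roughly as follows. This is an editorial sketch of the non-conjugated case only, with invented names; it is not code from the patch, and the XSFADD_R*/XSFADD_I* helpers choose different signs for the conjugated kernel variants.

/* Sketch: combine the accumulated partial products of one complex C element,
 * scale by alpha, then either accumulate into C (the #ifndef TRMMKERNEL path)
 * or overwrite it (the TRMM path).  acc_rr = sum(realA*realB),
 * acc_ir = sum(imagA*realB), acc_ri = sum(realA*imagB),
 * acc_ii = sum(imagA*imagB).  All names here are illustrative. */
static void zgemm_save_element(double *c, double acc_rr, double acc_ir,
                               double acc_ri, double acc_ii,
                               double alpha_r, double alpha_i, int add_to_c)
{
    double re = acc_rr - acc_ii;              /* realA*realB - imagA*imagB */
    double im = acc_ri + acc_ir;              /* realA*imagB + imagA*realB */
    double out_re = re * alpha_r - im * alpha_i;
    double out_im = re * alpha_i + im * alpha_r;

    if (add_to_c) { c[0] += out_re; c[1] += out_im; }
    else          { c[0]  = out_re; c[1]  = out_im; }
}

The sdot.c change above keeps the final sum in a double and folds in the single-precision partial sum produced by the unrolled kernel afterwards, presumably to reduce rounding error in the scalar tail. A minimal scalar model of the new unit-stride path, with the SIMD call sdot_kernel_16 replaced by a plain loop and BLASLONG/FLOAT replaced by long/float for illustration, is:

/* Sketch only: mirrors the control flow of the patched function for
 * inc_x == inc_y == 1; the real code calls sdot_kernel_16 for the
 * first n1 elements instead of the first loop below. */
static float sdot_sketch(long n, const float *x, const float *y)
{
    double dot   = 0.0;          /* final sum, kept in double as in the patch */
    float  mydot = 0.0f;         /* partial sum the unrolled kernel would fill */
    long   n1    = n & -32;      /* multiple-of-32 block handled by the kernel */
    long   i;

    for (i = 0; i < n1; i++)     /* stands in for sdot_kernel_16()             */
        mydot += x[i] * y[i];

    for (; i < n; i++)           /* scalar tail, accumulated into the double   */
        dot += y[i] * x[i];

    dot += mydot;
    return (float)dot;
}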