diff --git a/.travis.yml b/.travis.yml
index 990bed864..63b469716 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,6 +25,7 @@ before_install:
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script:
+ - set -e
- make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
- if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index df92cf4ef..da56c0758 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -141,5 +141,11 @@ In chronological order:
* Martin Koehler
* [2015-09-07] Improved imatcopy
+* Ashwin Sekhar T K
+ * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8)
+ * [2015-11-20] lapack-test fixes for Cortex-A57
+ * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57
+ * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57
+
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]
diff --git a/Makefile b/Makefile
index 5aa10b2c3..9ba2bffb3 100644
--- a/Makefile
+++ b/Makefile
@@ -83,20 +83,20 @@ shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@$(MAKE) -C exports so
- @-ln -fs $(LIBSONAME) $(LIBPREFIX).so
+ @ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
- @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
+ @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
diff --git a/USAGE.md b/USAGE.md
new file mode 100644
index 000000000..c76ceb324
--- /dev/null
+++ b/USAGE.md
@@ -0,0 +1,199 @@
+# Notes on OpenBLAS usage
+## Usage
+
+#### Program is Terminated. Because you tried to allocate too many memory regions
+
+In OpenBLAS, we manage a pool of memory buffers and allocate the number of
+buffers as follows.
+```
+#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
+```
+This error indicates that the program requested more buffers than are available.
+
+Please build OpenBLAS with a larger `NUM_THREADS`, for example `make
+NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`,
+`MAX_CPU_NUMBER` is set to `NUM_THREADS`.
+
+#### How can I use OpenBLAS in multi-threaded applications?
+
+If your application is already multi-threaded, it will conflict with OpenBLAS
+multi-threading. Thus, you must restrict OpenBLAS to a single thread in one of
+the following ways:
+
+* Set the environment variable `OPENBLAS_NUM_THREADS=1`, e.g. `export OPENBLAS_NUM_THREADS=1`.
+* Call `openblas_set_num_threads(1)` in the application at runtime (see the sketch below).
+* Build the single-threaded version of OpenBLAS, e.g. `make USE_THREAD=0`.
+
+If the application is parallelized with OpenMP, please use an OpenBLAS build
+with `USE_OPENMP=1`.
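+
+For illustration, a minimal sketch of the second option (the prototype is
+taken from the text above; it may also be provided by `cblas.h`):
+
+```
+/* Declared by OpenBLAS; shown here explicitly for a self-contained example. */
+extern void openblas_set_num_threads(int num_threads);
+
+int main(void)
+{
+    /* Restrict OpenBLAS to a single thread before creating our own threads. */
+    openblas_set_num_threads(1);
+
+    /* ... create application threads that call BLAS routines here ... */
+
+    return 0;
+}
+```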
+
+#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH
+
+The environment variable that controls kernel selection is
+`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`), e.g. `export
+OPENBLAS_CORETYPE=Haswell`. The function `char* openblas_get_corename()`
+returns the target that is actually in use.
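+
+A minimal sketch of querying the selected core at runtime (the prototype is
+taken from the text above; it may also be provided by `cblas.h`):
+
+```
+#include <stdio.h>
+
+/* Declared by OpenBLAS; shown here explicitly for a self-contained example. */
+extern char* openblas_get_corename(void);
+
+int main(void)
+{
+    /* Print the kernel target chosen at runtime (DYNAMIC_ARCH builds). */
+    printf("OpenBLAS core: %s\n", openblas_get_corename());
+    return 0;
+}
+```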
+
+#### How can I disable OpenBLAS thread affinity at runtime?
+
+You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment
+variable to disable thread affinity at runtime. For example, before running
+your program:
+```
+export OPENBLAS_MAIN_FREE=1
+```
+
+Alternatively, you can disable the affinity feature at build time by enabling
+`NO_AFFINITY=1` in `Makefile.rule`.
+
+## Linking with the library
+
+* Link with the shared library
+
+`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas`
+
+If the library is multithreaded, please add `-lpthread`. If the library
+contains LAPACK functions, please add `-lgfortran` or the appropriate Fortran
+runtime library.
+
+* Link with the static library
+
+`gcc -o test test.c /your/path/libopenblas.a`
+
+You can download `test.c` from https://gist.github.com/xianyi/5780018
+
+On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by
+default), custom programs statically linked against `libopenblas.a` should also
+link with the pthread library, e.g.:
+
+```
+gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread
+```
+
+Failing to add the `-lpthread` flag will cause errors such as:
+
+```
+/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory':
+memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock'
+memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock'
+...
+```
+
+## Code examples
+
+#### Call CBLAS interface
+This example shows how to call `cblas_dgemm` from C: https://gist.github.com/xianyi/6930656
+```
+#include <cblas.h>
+#include <stdio.h>
+
+int main()
+{
+ int i=0;
+ double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
+ double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
+ double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, 3, 3, 2, 1, A, 3, B, 3, 2, C, 3);
+
+ for(i=0; i<9; i++)
+ printf("%lf ", C[i]);
+ printf("\n");
+}
+```
+`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran`
+
+#### Call BLAS Fortran interface
+
+This example shows how to call the Fortran `dgemm` interface from C: https://gist.github.com/xianyi/5780018
+
+```
+#include "stdio.h"
+#include "stdlib.h"
+#include "sys/time.h"
+#include "time.h"
+
+extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*);
+
+int main(int argc, char* argv[])
+{
+ int i;
+ printf("test!\n");
+ if(argc<4){
+ printf("Input Error\n");
+ return 1;
+ }
+
+ int m = atoi(argv[1]);
+ int n = atoi(argv[2]);
+ int k = atoi(argv[3]);
+ int sizeofa = m * k;
+ int sizeofb = k * n;
+ int sizeofc = m * n;
+ char ta = 'N';
+ char tb = 'N';
+ double alpha = 1.2;
+ double beta = 0.001;
+
+ struct timeval start,finish;
+ double duration;
+
+ double* A = (double*)malloc(sizeof(double) * sizeofa);
+ double* B = (double*)malloc(sizeof(double) * sizeofb);
+ double* C = (double*)malloc(sizeof(double) * sizeofc);
+
+ srand((unsigned)time(NULL));
+
+ for (i=0; i<sizeofa; i++) A[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+ for (i=0; i<sizeofb; i++) B[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+ for (i=0; i<sizeofc; i++) C[i] = ((double)rand()/(double)RAND_MAX) - 0.5;
+
+ gettimeofday(&start, NULL);
+ dgemm_(&ta, &tb, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m);
+ gettimeofday(&finish, NULL);
+
+ duration = ((double)(finish.tv_sec-start.tv_sec)*1000000 + (double)(finish.tv_usec-start.tv_usec)) / 1000000;
+ printf("dgemm: m=%d n=%d k=%d time=%lf s\n", m, n, k, duration);
+
+ free(A);
+ free(B);
+ free(C);
+ return 0;
+}
+```
+
+## Troubleshooting
+* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
+* Please use gcc version 4.6 or above to compile the Sandy Bridge AVX kernels on Linux/MinGW/BSD.
+* Please use Clang version 3.1 or above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 will generate wrong AVX binary code.
+* The number of CPUs/cores should be less than or equal to 256. On Linux x86_64 (amd64), there is experimental support for up to 1024 CPUs/cores and 128 NUMA nodes if you build the library with `BIGNUMA=1`.
+* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in `Makefile.rule`. However, this may cause [a conflict with R's parallel package](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
+* On Loongson 3A, `make test` may fail with a `pthread_create` error (error code EAGAIN). However, the same test case passes when run from the shell.
+
+## BLAS reference manual
+If you want to understand every BLAS function and definition, please read the
+[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm)
+or the BLAS documentation at [netlib.org](http://netlib.org/blas/).
+
+Here are the [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions).
+
+## How to reference OpenBLAS
+
+You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications).
+
+Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly.
+
diff --git a/benchmark/Makefile b/benchmark/Makefile
index bcf3da2cc..11d3c5bec 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -2134,7 +2134,7 @@ zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
- $(CC) $(CFLAGS) -lpthread -fopenmp -lm -o $(@F) $^
+ $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c
index daed8f3da..9068c61b1 100644
--- a/benchmark/smallscaling.c
+++ b/benchmark/smallscaling.c
@@ -23,28 +23,32 @@ typedef struct {
void * s_create_matrix(int size) {
float * r = malloc(size * sizeof(double));
- for(int i = 0; i < size; i++)
+ int i;
+ for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * c_create_matrix(int size) {
float * r = malloc(size * 2 * sizeof(double));
- for(int i = 0; i < 2 * size; i++)
+ int i;
+ for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * z_create_matrix(int size) {
double * r = malloc(size * 2 * sizeof(double));
- for(int i = 0; i < 2 * size; i++)
+ int i;
+ for(i = 0; i < 2 * size; i++)
r[i] = 1e3 * i / size;
return r;
}
void * d_create_matrix(int size) {
double * r = malloc(size * sizeof(double));
- for(int i = 0; i < size; i++)
+ int i;
+ for(i = 0; i < size; i++)
r[i] = 1e3 * i / size;
return r;
}
@@ -188,4 +192,5 @@ int main(int argc, char * argv[]) {
size *= inc_factor;
}
}
+ return(0);
}
diff --git a/common.h b/common.h
index 6b65c37d1..e045e42b2 100644
--- a/common.h
+++ b/common.h
@@ -332,12 +332,13 @@ typedef int blasint;
#endif
#endif
-
+/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
+*/
/*
#ifdef STEAMROLLER
diff --git a/common_power.h b/common_power.h
index ab331b04a..052d38828 100644
--- a/common_power.h
+++ b/common_power.h
@@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
+#if defined(POWER8)
+#define L1_DUALFETCH
+#define L1_PREFETCHSIZE (16 + 128 * 100)
+#define L1_PREFETCH dcbtst
+#endif
+
+#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
@@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
+#elif defined(POWER8)
+#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif
diff --git a/common_x86_64.h b/common_x86_64.h
index da9afc0e4..11937b415 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -396,7 +396,7 @@ REALNAME:
#define PROFCODE
-#define EPILOGUE .end REALNAME
+#define EPILOGUE .end
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
diff --git a/cpuid_power.c b/cpuid_power.c
index 366c6ed08..951204ae9 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -55,6 +55,7 @@
#define CPUTYPE_POWER6 5
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
+#define CPUTYPE_POWER8 8
char *cpuname[] = {
"UNKNOWN",
@@ -65,6 +66,7 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
+ "POWER8"
};
char *lowercpuname[] = {
@@ -76,6 +78,7 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
+ "power8"
};
char *corename[] = {
@@ -87,6 +90,7 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
+ "POWER8"
};
int detect(void){
@@ -115,7 +119,7 @@ int detect(void){
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
- if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
+ if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
diff --git a/ctest/cin2 b/ctest/cin2
index 032fcbb39..b2e1e4a0e 100644
--- a/ctest/cin2
+++ b/ctest/cin2
@@ -1,7 +1,7 @@
'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/cin3 b/ctest/cin3
index 223d165db..fbdb57857 100644
--- a/ctest/cin3
+++ b/ctest/cin3
@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/cin3_3m b/ctest/cin3_3m
index 34014143e..5a797291a 100644
--- a/ctest/cin3_3m
+++ b/ctest/cin3_3m
@@ -1,7 +1,7 @@
'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/din2 b/ctest/din2
index 6f42b2792..df8f7b6ae 100644
--- a/ctest/din2
+++ b/ctest/din2
@@ -1,7 +1,7 @@
'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/din3 b/ctest/din3
index cbbcc22ab..23fedfe32 100644
--- a/ctest/din3
+++ b/ctest/din3
@@ -1,7 +1,7 @@
'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/sin2 b/ctest/sin2
index 3eee5c2f9..0e1ecd9d6 100644
--- a/ctest/sin2
+++ b/ctest/sin2
@@ -1,7 +1,7 @@
'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/sin3 b/ctest/sin3
index 01e32d6ee..644083f22 100644
--- a/ctest/sin3
+++ b/ctest/sin3
@@ -1,7 +1,7 @@
'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin2 b/ctest/zin2
index 4c0affe92..217697191 100644
--- a/ctest/zin2
+++ b/ctest/zin2
@@ -1,7 +1,7 @@
'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED)
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin3 b/ctest/zin3
index 70050b693..ee269e8d5 100644
--- a/ctest/zin3
+++ b/ctest/zin3
@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/ctest/zin3_3m b/ctest/zin3_3m
index 33bf08353..a0d4fde0a 100644
--- a/ctest/zin3_3m
+++ b/ctest/zin3_3m
@@ -1,7 +1,7 @@
'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE
-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0)
F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD.
-F LOGICAL FLAG, T TO STOP ON FAILURES.
+T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO
diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c
index 92c86aec2..2d5fb7802 100644
--- a/driver/level2/ztrmv_L.c
+++ b/driver/level2/ztrmv_L.c
@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
if (incb != 1) {
B = buffer;
- gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
+ gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}
diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt
index b2af55e36..b361f2a97 100644
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@@ -33,6 +33,7 @@ set(COMMON_SOURCES
xerbla.c
openblas_set_num_threads.c
openblas_error_handle.c
+ openblas_env.c
openblas_get_num_procs.c
openblas_get_num_threads.c
)
diff --git a/driver/others/Makefile b/driver/others/Makefile
index ed145cee8..e61ba7bc8 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
-COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX)
+COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX)
#COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
openblas_error_handle.$(SUFFIX) : openblas_error_handle.c
$(CC) $(CFLAGS) -c $< -o $(@F)
+openblas_env.$(SUFFIX) : openblas_env.c
+ $(CC) $(CFLAGS) -c $< -o $(@F)
+
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index c3bf80173..42cadf4b5 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#endif
+extern unsigned int openblas_thread_timeout();
+
#ifdef SMP_SERVER
#undef MONITOR
@@ -524,6 +526,7 @@ static int blas_monitor(void *arg){
int blas_thread_init(void){
BLASLONG i;
int ret;
+ int thread_timeout_env;
#ifdef NEED_STACKATTR
pthread_attr_t attr;
#endif
@@ -540,22 +543,12 @@ int blas_thread_init(void){
if (!blas_server_avail){
- env_var_t p;
-
- if (readenv(p,"THREAD_TIMEOUT")) {
- thread_timeout = atoi(p);
- if (thread_timeout < 4) thread_timeout = 4;
- if (thread_timeout > 30) thread_timeout = 30;
- thread_timeout = (1 << thread_timeout);
- }else{
- if (readenv(p,"GOTO_THREAD_TIMEOUT")) {
- thread_timeout = atoi(p);
- if (thread_timeout < 4) thread_timeout = 4;
- if (thread_timeout > 30) thread_timeout = 30;
- thread_timeout = (1 << thread_timeout);
- }
- }
-
+ thread_timeout_env=openblas_thread_timeout();
+ if (thread_timeout_env>0) {
+ if (thread_timeout_env < 4) thread_timeout_env = 4;
+ if (thread_timeout_env > 30) thread_timeout_env = 30;
+ thread_timeout = (1 << thread_timeout_env);
+ }
for(i = 0; i < blas_num_threads - 1; i++){
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index a2b7c7045..2fde07fcc 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -391,7 +391,7 @@ static char *corename[] = {
"Nehalem",
"Athlon",
"Opteron",
- "Opteron(SSE3)",
+ "Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",
diff --git a/driver/others/memory.c b/driver/others/memory.c
index e0761d784..e64781740 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -294,8 +294,11 @@ void openblas_fork_handler()
#endif
}
+extern int openblas_num_threads_env();
+extern int openblas_goto_num_threads_env();
+extern int openblas_omp_num_threads_env();
+
int blas_get_cpu_number(void){
- env_var_t p;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
@@ -310,18 +313,18 @@ int blas_get_cpu_number(void){
blas_goto_num = 0;
#ifndef USE_OPENMP
- if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p);
+ blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
if (blas_goto_num == 0) {
- if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p);
- if (blas_goto_num < 0) blas_goto_num = 0;
+ blas_goto_num=openblas_goto_num_threads_env();
+ if (blas_goto_num < 0) blas_goto_num = 0;
}
#endif
blas_omp_num = 0;
- if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p);
+ blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
@@ -1340,6 +1343,7 @@ static void gotoblas_memory_init(void) {
/* Initialization for all function; this function should be called before main */
static int gotoblas_initialized = 0;
+extern void openblas_read_env();
void CONSTRUCTOR gotoblas_init(void) {
@@ -1349,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) {
openblas_fork_handler();
#endif
+ openblas_read_env();
+
#ifdef PROFILE
moncontrol (0);
#endif
diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c
new file mode 100644
index 000000000..64ece9515
--- /dev/null
+++ b/driver/others/openblas_env.c
@@ -0,0 +1,84 @@
+/***************************************************************************
+Copyright (c) 2011-2016, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*****************************************************************************/
+
+#include "common.h"
+
+static int openblas_env_verbose=0;
+static unsigned int openblas_env_thread_timeout=0;
+static int openblas_env_block_factor=0;
+static int openblas_env_openblas_num_threads=0;
+static int openblas_env_goto_num_threads=0;
+static int openblas_env_omp_num_threads=0;
+
+int openblas_verbose() { return openblas_env_verbose;}
+unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
+int openblas_block_factor() { return openblas_env_block_factor;}
+int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
+int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
+int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
+
+void openblas_read_env() {
+ int ret=0;
+ env_var_t p;
+ if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_verbose=ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_block_factor=ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_thread_timeout=(unsigned int)ret;
+
+ ret=0;
+ if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_openblas_num_threads=ret;
+
+ ret=0;
+ if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_goto_num_threads=ret;
+
+ ret=0;
+ if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p);
+ if(ret<0) ret=0;
+ openblas_env_omp_num_threads=ret;
+
+}
+
+
diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c
index f32a54452..9ac72c15d 100644
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
-int openblas_verbose() {
- int ret=0;
- env_var_t p;
- if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
- if(ret<0) ret=0;
- return ret;
-}
+extern int openblas_verbose();
void openblas_warning(int verbose, const char * msg) {
int current_verbose;
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index d741f2fb9..f4b1a80ad 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -40,6 +40,7 @@
#include
#include "common.h"
+extern int openblas_block_factor();
int get_L2_size(void);
#define DEFAULT_GEMM_P 128
@@ -249,7 +250,6 @@ int get_L2_size(void){
void blas_set_parameter(void){
- env_var_t p;
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
int size = 16;
@@ -468,9 +468,8 @@ void blas_set_parameter(void){
#endif
#endif
-
- if (readenv(p,"GOTO_BLOCK_FACTOR")) {
- factor = atoi(p);
+ factor=openblas_block_factor();
+ if (factor>0) {
if (factor < 10) factor = 10;
if (factor > 200) factor = 200;
diff --git a/getarch.c b/getarch.c
index fb80a4c9b..f9c49e663 100644
--- a/getarch.c
+++ b/getarch.c
@@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
-#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
+#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
+#if defined(FORCE_POWER8)
+#define FORCE
+#define ARCHITECTURE "POWER"
+#define SUBARCHITECTURE "POWER8"
+#define SUBDIRNAME "power"
+#define ARCHCONFIG "-DPOWER8 " \
+ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
+ "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
+ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME "power8"
+#define CORENAME "POWER8"
+#endif
+
+
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 63e675b8d..8e6827424 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
+ifeq ($(CORE), POWER8)
+USE_TRMM = 1
+endif
+
+
SKERNELOBJS += \
diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57
index 7c8eeeea7..64666f05b 100644
--- a/kernel/arm64/KERNEL.CORTEXA57
+++ b/kernel/arm64/KERNEL.CORTEXA57
@@ -60,32 +60,55 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
-STRMMKERNEL = strmm_kernel_4x4.S
-DTRMMKERNEL = dtrmm_kernel_4x4.S
-CTRMMKERNEL = ctrmm_kernel_4x4.S
-ZTRMMKERNEL = ztrmm_kernel_4x4.S
-
-SGEMMKERNEL = sgemm_kernel_4x4.S
-SGEMMONCOPY = ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ = sgemm_incopy.o
+SGEMMITCOPYOBJ = sgemm_itcopy.o
+endif
+SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
-DGEMMKERNEL = dgemm_kernel_4x4.S
-DGEMMONCOPY = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+endif
+DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
-CGEMMKERNEL = cgemm_kernel_4x4.S
-CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ = cgemm_incopy.o
+CGEMMITCOPYOBJ = cgemm_itcopy.o
+endif
+CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
-ZGEMMKERNEL = zgemm_kernel_4x4.S
-ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
+ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+endif
+ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S
new file mode 100755
index 000000000..40b98cee2
--- /dev/null
+++ b/kernel/arm64/cgemm_kernel_8x4.S
@@ -0,0 +1,2044 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0_R s10
+#define alphaV0_R v10.s[0]
+#define alpha0_I s11
+#define alphaV0_I v11.s[0]
+
+#define alpha1_R s14
+#define alphaV1_R v14.s[0]
+#define alpha1_I s15
+#define alphaV1_I v15.s[0]
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmla
+#define OP_ir fmla
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmls
+#define OP_ir fmla
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmla
+#define OP_ir fmls
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmls
+#define OP_ir fmls
+#endif
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset -> temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
+//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
+//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
+//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
+//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
+//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
+//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
+//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
+//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
+//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
+//v10 must save ALPHA0_R
+//v11 must save ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
+//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
+//v14 must save ALPHA1_R
+//v15 must save ALPHA1_I
+//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
+//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
+//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
+//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
+//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
+//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
+//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
+//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
+//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
+//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
+//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
+//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
+//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
+//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
+//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
+//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
+
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.4s, v2.4s, v9.4s[0]
+#else
+ fmul v19.4s, v2.4s, v9.4s[0]
+#endif
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.4s, v2.4s, v9.4s[1]
+#else
+ fmul v23.4s, v2.4s, v9.4s[1]
+#endif
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.4s, v2.4s, v9.4s[2]
+#else
+ fmul v27.4s, v2.4s, v9.4s[2]
+#endif
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ fmul v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.4s, v2.4s, v9.4s[3]
+#else
+ fmul v31.4s, v2.4s, v9.4s[3]
+#endif
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+.endm
+
+.macro KERNEL8x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+.endm
+
+.macro SAVE8x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmla v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmla v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v26.4s, alphaV0_R
+ fmls v2.4s, v27.4s, alphaV0_I
+ fmla v3.4s, v26.4s, alphaV1_I
+ fmla v3.4s, v27.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmla v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v30.4s, alphaV0_R
+ fmls v6.4s, v31.4s, alphaV0_I
+ fmla v7.4s, v30.4s, alphaV1_I
+ fmla v7.4s, v31.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro KERNEL4x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ ld2 {v8.4s, v9.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ ld2 {v0.4s, v1.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro SAVE4x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmla v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmla v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL2x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.4s[0]
+ OP_ii v16.2s, v1.2s, v9.4s[0]
+ OP_ri v17.2s, v0.2s, v9.4s[0]
+ OP_ir v17.2s, v1.2s, v8.4s[0]
+
+ OP_rr v20.2s, v0.2s, v8.4s[1]
+ OP_ii v20.2s, v1.2s, v9.4s[1]
+ OP_ri v21.2s, v0.2s, v9.4s[1]
+ OP_ir v21.2s, v1.2s, v8.4s[1]
+
+ OP_rr v24.2s, v0.2s, v8.4s[2]
+ OP_ii v24.2s, v1.2s, v9.4s[2]
+ OP_ri v25.2s, v0.2s, v9.4s[2]
+ OP_ir v25.2s, v1.2s, v8.4s[2]
+
+ OP_rr v28.2s, v0.2s, v8.4s[3]
+ OP_ii v28.2s, v1.2s, v9.4s[3]
+ OP_ri v29.2s, v0.2s, v9.4s[3]
+ OP_ir v29.2s, v1.2s, v8.4s[3]
+.endm
+
+.macro SAVE2x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmla v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v24.2s, alphaV0_R
+ fmls v0.2s, v25.2s, alphaV0_I
+ fmla v1.2s, v24.2s, alphaV1_I
+ fmla v1.2s, v25.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v28.2s, alphaV0_R
+ fmls v4.2s, v29.2s, alphaV0_I
+ fmla v5.2s, v28.2s, alphaV1_I
+ fmla v5.2s, v29.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL1x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.4s[0]
+ OP_ii s16, s1, v9.4s[0]
+ OP_ri s17, s0, v9.4s[0]
+ OP_ir s17, s1, v8.4s[0]
+
+ OP_rr s20, s0, v8.4s[1]
+ OP_ii s20, s1, v9.4s[1]
+ OP_ri s21, s0, v9.4s[1]
+ OP_ir s21, s1, v8.4s[1]
+
+ OP_rr s24, s0, v8.4s[2]
+ OP_ii s24, s1, v9.4s[2]
+ OP_ri s25, s0, v9.4s[2]
+ OP_ir s25, s1, v8.4s[2]
+
+ OP_rr s28, s0, v8.4s[3]
+ OP_ii s28, s1, v9.4s[3]
+ OP_ri s29, s0, v9.4s[3]
+ OP_ir s29, s1, v8.4s[3]
+.endm
+
+.macro SAVE1x4
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmla s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s24, alphaV0_R
+ fmls s0, s25, alphaV0_I
+ fmla s1, s24, alphaV1_I
+ fmla s1, s25, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s28, alphaV0_R
+ fmls s4, s29, alphaV0_I
+ fmla s5, s28, alphaV1_I
+ fmla s5, s29, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v18.4s, v2.4s, v8.2s[0]
+ OP_ii v18.4s, v3.4s, v9.2s[0]
+ OP_ri v19.4s, v2.4s, v9.2s[0]
+ OP_ir v19.4s, v3.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+
+ OP_rr v22.4s, v2.4s, v8.2s[1]
+ OP_ii v22.4s, v3.4s, v9.2s[1]
+ OP_ri v23.4s, v2.4s, v9.2s[1]
+ OP_ir v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow2]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+ ld2 {v6.4s, v7.4s}, [pCRow2]
+ fmla v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmla v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL4x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmla v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL2x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.2s[0]
+ OP_ii v16.2s, v1.2s, v9.2s[0]
+ OP_ri v17.2s, v0.2s, v9.2s[0]
+ OP_ir v17.2s, v1.2s, v8.2s[0]
+
+ OP_rr v20.2s, v0.2s, v8.2s[1]
+ OP_ii v20.2s, v1.2s, v9.2s[1]
+ OP_ri v21.2s, v0.2s, v9.2s[1]
+ OP_ir v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.2s, v5.2s}, [pCRow1]
+ fmla v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmla v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.2s[0]
+ OP_ii s16, s1, v9.2s[0]
+ OP_ri s17, s0, v9.2s[0]
+ OP_ir s17, s1, v8.2s[0]
+
+ OP_rr s20, s0, v8.2s[1]
+ OP_ii s20, s1, v9.2s[1]
+ OP_ri s21, s0, v9.2s[1]
+ OP_ir s21, s1, v8.2s[1]
+.endm
+
+.macro SAVE1x2
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+ ld2 {v4.s, v5.s}[0], [pCRow1]
+ fmla s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmla s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v8.4s[1]
+ OP_ri v17.4s, v0.4s, v8.4s[1]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v8.4s[1]
+ OP_ri v19.4s, v2.4s, v8.4s[1]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+.endm
+
+.macro SAVE8x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld2 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmla v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+.endm
+
+.macro SAVE4x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.4s, v1.4s}, [pCRow1]
+ fmla v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmla v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+.endm
+
+.macro SAVE2x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.2s, v1.2s}, [pCRow1]
+ fmla v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmla v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+.endm
+
+.macro SAVE1x1
+ mov pCRow1, pCRow0
+
+ ld2 {v0.s, v1.s}[0], [pCRow1]
+ fmla s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmla s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0_R, s0
+ fmov alpha0_I, s1
+ fmov alpha1_R, s0
+ fmov alpha1_I, s1
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble cgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
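+// Loop nest: counterJ steps through N in blocks of 4 columns (the L4 section);
+// the L2 and L1 sections below mop up a remaining 2 and 1 columns.  Inside each
+// column block, counterI steps through M in blocks of 8, 4, 2 and 1 rows, and
+// counterL steps through K (software-pipelined by 2 in the 8x4 and 4x4 paths of
+// the L4 section, unrolled by 8 elsewhere).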
+
+cgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+cgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L4_M4_BEGIN
+
+cgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt cgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble cgemm_kernel_L4_M8_22a
+ .align 5
+
+cgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M8_22
+
+
+cgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b cgemm_kernel_L4_M8_44
+
+cgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble cgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b cgemm_kernel_L4_M8_44
+
+cgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+cgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble cgemm_kernel_L4_M8_100
+
+cgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+cgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+cgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne cgemm_kernel_L4_M8_20
+
+cgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble cgemm_kernel_L4_M2_BEGIN
+
+
+cgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt cgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble cgemm_kernel_L4_M4_22a
+ .align 5
+
+
+cgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M4_22
+
+cgemm_kernel_L4_M4_22a:
+ KERNEL4x4_M1
+ KERNEL4x4_E
+ b cgemm_kernel_L4_M4_44
+cgemm_kernel_L4_M4_32:
+ tst counterL, #1
+ ble cgemm_kernel_L4_M4_40
+ KERNEL4x4_I
+ KERNEL4x4_E
+ b cgemm_kernel_L4_M4_44
+cgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+cgemm_kernel_L4_M4_44:
+ ands counterL , origK, #1
+ ble cgemm_kernel_L4_M4_100
+
+cgemm_kernel_L4_M4_46:
+ KERNEL4x4_SUB
+
+cgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+cgemm_kernel_L4_M4_END:
+
+cgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L4_M1_BEGIN
+
+cgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L4_M2_40
+
+cgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M2_22
+
+
+cgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L4_M2_100
+
+cgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M2_42
+
+cgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+cgemm_kernel_L4_M2_END:
+
+
+cgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L4_END
+
+cgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L4_M1_40
+
+cgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M1_22
+
+
+cgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L4_M1_100
+
+cgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L4_M1_42
+
+cgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+
+cgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt cgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	cgemm_kernel_L999				// nothing left to do: N was a multiple of 4
+
+ tst counterJ , #2
+ ble cgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+
+cgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L2_M4_BEGIN
+
+cgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M8_40
+ .align 5
+
+cgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M8_22
+
+
+cgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M8_100
+
+cgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M8_42
+
+cgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+cgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt cgemm_kernel_L2_M8_20
+
+cgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L2_END
+
+	tst	counterI, #4					// is there a 4-row block (M & 4) left?
+ ble cgemm_kernel_L2_M2_BEGIN
+
+cgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M4_40
+ .align 5
+
+cgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M4_22
+
+
+cgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M4_100
+
+cgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M4_42
+
+cgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+cgemm_kernel_L2_M4_END:
+
+cgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L2_M1_BEGIN
+
+cgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble cgemm_kernel_L2_M2_40
+
+cgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M2_22
+
+
+cgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M2_100
+
+cgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M2_42
+
+cgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+cgemm_kernel_L2_M2_END:
+
+
+cgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L2_END
+
+cgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble cgemm_kernel_L2_M1_40
+
+cgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M1_22
+
+
+cgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L2_M1_100
+
+cgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L2_M1_42
+
+cgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+
+cgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+cgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble cgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+ mov pA, origPA // pA = A
+
+
+cgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble cgemm_kernel_L1_M4_BEGIN
+
+cgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M8_40
+ .align 5
+
+cgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M8_22
+
+
+cgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M8_100
+
+cgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M8_42
+
+cgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+cgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt cgemm_kernel_L1_M8_20
+
+cgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble cgemm_kernel_L1_END
+
+	tst	counterI, #4					// is there a 4-row block (M & 4) left?
+ ble cgemm_kernel_L1_M2_BEGIN
+
+
+cgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M4_40
+ .align 5
+
+cgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M4_22
+
+
+cgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M4_100
+
+cgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M4_42
+
+cgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+cgemm_kernel_L1_M4_END:
+
+
+cgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble cgemm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble cgemm_kernel_L1_M1_BEGIN
+
+cgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M2_40
+
+cgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M2_22
+
+
+cgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M2_100
+
+cgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M2_42
+
+cgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+cgemm_kernel_L1_M2_END:
+
+
+cgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble cgemm_kernel_L1_END
+
+cgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble cgemm_kernel_L1_M1_40
+
+cgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M1_22
+
+
+cgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble cgemm_kernel_L1_M1_100
+
+cgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt cgemm_kernel_L1_M1_42
+
+cgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+cgemm_kernel_L1_END:
+
+
+cgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
new file mode 100755
index 000000000..3131541d4
--- /dev/null
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -0,0 +1,2425 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Arguments:  X0 bm,  X1 bn,  X2 bk,  s0 alpha0,  s1 alpha1,  X3 ba,  X4 bb,  X5 C,  X6 ldc,  X7 offset */
+/* int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT alpha1, FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0_R s10
+#define alphaV0_R v10.s[0]
+#define alpha0_I s11
+#define alphaV0_I v11.s[0]
+
+#define alpha1_R s14
+#define alphaV1_R v14.s[0]
+#define alpha1_I s15
+#define alphaV1_I v15.s[0]
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmla
+#define OP_ir fmla
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmls
+#define OP_ir fmla
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr fmla
+#define OP_ii fmla
+#define OP_ri fmla
+#define OP_ir fmls
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr fmla
+#define OP_ii fmls
+#define OP_ri fmls
+#define OP_ir fmls
+#endif
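+
+// The OP_* macros pick fmla/fmls so that each K step performs a complex
+// multiply-accumulate with A and/or B conjugated as required by the variant
+// selected above.  For the plain (NN/NT/TN/TT) case this is:
+//   acc_R += a_R*b_R - a_I*b_I
+//   acc_I += a_R*b_I + a_I*b_R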
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R
+//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I
+//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R
+//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I
+//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R
+//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I
+//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R
+//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I
+//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R
+//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I
+//v10 must save ALPHA0_R
+//v11 must save ALPHA0_I
+//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R
+//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I
+//v14 must save ALPHA1_R
+//v15 must save ALPHA1_I
+//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R
+//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I
+//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R
+//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I
+//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R
+//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I
+//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R
+//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I
+//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R
+//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I
+//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R
+//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I
+//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R
+//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I
+//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R
+//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v19.16b, v19.16b, v19.16b
+ fmls v19.4s, v2.4s, v9.4s[0]
+#else
+ fmul v19.4s, v2.4s, v9.4s[0]
+#endif
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v23.16b, v23.16b, v23.16b
+ fmls v23.4s, v2.4s, v9.4s[1]
+#else
+ fmul v23.4s, v2.4s, v9.4s[1]
+#endif
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v27.16b, v27.16b, v27.16b
+ fmls v27.4s, v2.4s, v9.4s[2]
+#else
+ fmul v27.4s, v2.4s, v9.4s[2]
+#endif
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ fmul v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v31.16b, v31.16b, v31.16b
+ fmls v31.4s, v2.4s, v9.4s[3]
+#else
+ fmul v31.4s, v2.4s, v9.4s[3]
+#endif
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
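+
+// KERNEL8x4_I/_M1/_M2/_E form a software-pipelined sequence: _I computes the
+// first K step from v0-v3/v8-v9 and preloads the next step into v4-v7/v12-v13,
+// _M1 and _M2 then alternate between the two register sets, each loading the
+// operands for the following step, and _E drains the last preloaded step
+// without issuing further loads.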
+
+.macro KERNEL8x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+ ld2 {v6.4s, v7.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v18.4s, v6.4s, v12.4s[0]
+ OP_ii v18.4s, v7.4s, v13.4s[0]
+ OP_ri v19.4s, v6.4s, v13.4s[0]
+ OP_ir v19.4s, v7.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v22.4s, v6.4s, v12.4s[1]
+ OP_ii v22.4s, v7.4s, v13.4s[1]
+ OP_ri v23.4s, v6.4s, v13.4s[1]
+ OP_ir v23.4s, v7.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v26.4s, v6.4s, v12.4s[2]
+ OP_ii v26.4s, v7.4s, v13.4s[2]
+ OP_ri v27.4s, v6.4s, v13.4s[2]
+ OP_ir v27.4s, v7.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+
+ OP_rr v30.4s, v6.4s, v12.4s[3]
+ OP_ii v30.4s, v7.4s, v13.4s[3]
+ OP_ri v31.4s, v6.4s, v13.4s[3]
+ OP_ir v31.4s, v7.4s, v12.4s[3]
+
+.endm
+
+.macro KERNEL8x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v9.4s[0]
+ OP_ri v19.4s, v2.4s, v9.4s[0]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v22.4s, v2.4s, v8.4s[1]
+ OP_ii v22.4s, v3.4s, v9.4s[1]
+ OP_ri v23.4s, v2.4s, v9.4s[1]
+ OP_ir v23.4s, v3.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v26.4s, v2.4s, v8.4s[2]
+ OP_ii v26.4s, v3.4s, v9.4s[2]
+ OP_ri v27.4s, v2.4s, v9.4s[2]
+ OP_ir v27.4s, v3.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ OP_rr v30.4s, v2.4s, v8.4s[3]
+ OP_ii v30.4s, v3.4s, v9.4s[3]
+ OP_ri v31.4s, v2.4s, v9.4s[3]
+ OP_ir v31.4s, v3.4s, v8.4s[3]
+
+.endm
+
+.macro SAVE8x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmul v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmul v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v26.4s, alphaV0_R
+ fmls v2.4s, v27.4s, alphaV0_I
+ fmul v3.4s, v26.4s, alphaV1_I
+ fmla v3.4s, v27.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmul v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v30.4s, alphaV0_R
+ fmls v6.4s, v31.4s, alphaV0_I
+ fmul v7.4s, v30.4s, alphaV1_I
+ fmla v7.4s, v31.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
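+
+// Unlike the cgemm kernel, the TRMM SAVE macros do not load C first: the tile
+// is simply overwritten with alpha * accumulator (fmul/fmls/fmla instead of
+// ld2 + fmla), matching TRMM overwriting its output rather than accumulating
+// into it.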
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ fmul v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v17.16b, v17.16b, v17.16b
+ fmls v17.4s, v0.4s, v9.4s[0]
+#else
+ fmul v17.4s, v0.4s, v9.4s[0]
+#endif
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ fmul v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v21.16b, v21.16b, v21.16b
+ fmls v21.4s, v0.4s, v9.4s[1]
+#else
+ fmul v21.4s, v0.4s, v9.4s[1]
+#endif
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ fmul v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v25.16b, v25.16b, v25.16b
+ fmls v25.4s, v0.4s, v9.4s[2]
+#else
+ fmul v25.4s, v0.4s, v9.4s[2]
+#endif
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ fmul v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
+ defined(RR) || defined(RC) || defined(CR) || defined(CC)
+ eor v29.16b, v29.16b, v29.16b
+ fmls v29.4s, v0.4s, v9.4s[3]
+#else
+ fmul v29.4s, v0.4s, v9.4s[3]
+#endif
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+
+ ld2 {v12.4s, v13.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v4.4s, v5.4s}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ ld2 {v12.4s, v13.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ ld2 {v4.4s, v5.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro KERNEL4x4_M2
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ ld2 {v8.4s, v9.4s}, [pB] // For next round
+ add pB, pB, #32
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ ld2 {v0.4s, v1.4s}, [pA] // For next round
+ add pA, pA, #32
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_E
+ OP_rr v16.4s, v4.4s, v12.4s[0]
+ OP_ii v16.4s, v5.4s, v13.4s[0]
+ OP_ri v17.4s, v4.4s, v13.4s[0]
+ OP_ir v17.4s, v5.4s, v12.4s[0]
+
+ OP_rr v20.4s, v4.4s, v12.4s[1]
+ OP_ii v20.4s, v5.4s, v13.4s[1]
+ OP_ri v21.4s, v4.4s, v13.4s[1]
+ OP_ir v21.4s, v5.4s, v12.4s[1]
+
+ OP_rr v24.4s, v4.4s, v12.4s[2]
+ OP_ii v24.4s, v5.4s, v13.4s[2]
+ OP_ri v25.4s, v4.4s, v13.4s[2]
+ OP_ir v25.4s, v5.4s, v12.4s[2]
+
+ OP_rr v28.4s, v4.4s, v12.4s[3]
+ OP_ii v28.4s, v5.4s, v13.4s[3]
+ OP_ri v29.4s, v4.4s, v13.4s[3]
+ OP_ir v29.4s, v5.4s, v12.4s[3]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v9.4s[0]
+ OP_ri v17.4s, v0.4s, v9.4s[0]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v20.4s, v0.4s, v8.4s[1]
+ OP_ii v20.4s, v1.4s, v9.4s[1]
+ OP_ri v21.4s, v0.4s, v9.4s[1]
+ OP_ir v21.4s, v1.4s, v8.4s[1]
+
+ OP_rr v24.4s, v0.4s, v8.4s[2]
+ OP_ii v24.4s, v1.4s, v9.4s[2]
+ OP_ri v25.4s, v0.4s, v9.4s[2]
+ OP_ir v25.4s, v1.4s, v8.4s[2]
+
+ OP_rr v28.4s, v0.4s, v8.4s[3]
+ OP_ii v28.4s, v1.4s, v9.4s[3]
+ OP_ri v29.4s, v0.4s, v9.4s[3]
+ OP_ir v29.4s, v1.4s, v8.4s[3]
+.endm
+
+.macro SAVE4x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0_R
+ fmls v0.4s, v25.4s, alphaV0_I
+ fmul v1.4s, v24.4s, alphaV1_I
+ fmla v1.4s, v25.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0_R
+ fmls v4.4s, v29.4s, alphaV0_I
+ fmul v5.4s, v28.4s, alphaV1_I
+ fmla v5.4s, v29.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL2x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.4s[0]
+ OP_ii v16.2s, v1.2s, v9.4s[0]
+ OP_ri v17.2s, v0.2s, v9.4s[0]
+ OP_ir v17.2s, v1.2s, v8.4s[0]
+
+ OP_rr v20.2s, v0.2s, v8.4s[1]
+ OP_ii v20.2s, v1.2s, v9.4s[1]
+ OP_ri v21.2s, v0.2s, v9.4s[1]
+ OP_ir v21.2s, v1.2s, v8.4s[1]
+
+ OP_rr v24.2s, v0.2s, v8.4s[2]
+ OP_ii v24.2s, v1.2s, v9.4s[2]
+ OP_ri v25.2s, v0.2s, v9.4s[2]
+ OP_ir v25.2s, v1.2s, v8.4s[2]
+
+ OP_rr v28.2s, v0.2s, v8.4s[3]
+ OP_ii v28.2s, v1.2s, v9.4s[3]
+ OP_ri v29.2s, v0.2s, v9.4s[3]
+ OP_ir v29.2s, v1.2s, v8.4s[3]
+.endm
+
+.macro SAVE2x4
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmul v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v0.2s, v24.2s, alphaV0_R
+ fmls v0.2s, v25.2s, alphaV0_I
+ fmul v1.2s, v24.2s, alphaV1_I
+ fmla v1.2s, v25.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v28.2s, alphaV0_R
+ fmls v4.2s, v29.2s, alphaV0_I
+ fmul v5.2s, v28.2s, alphaV1_I
+ fmla v5.2s, v29.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+ fmov s24, s16
+ fmov s25, s17
+ fmov s28, s16
+ fmov s29, s17
+.endm
+
+.macro KERNEL1x4_SUB
+ ld2 {v8.4s, v9.4s}, [pB]
+ add pB, pB, #32
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.4s[0]
+ OP_ii s16, s1, v9.4s[0]
+ OP_ri s17, s0, v9.4s[0]
+ OP_ir s17, s1, v8.4s[0]
+
+ OP_rr s20, s0, v8.4s[1]
+ OP_ii s20, s1, v9.4s[1]
+ OP_ri s21, s0, v9.4s[1]
+ OP_ir s21, s1, v8.4s[1]
+
+ OP_rr s24, s0, v8.4s[2]
+ OP_ii s24, s1, v9.4s[2]
+ OP_ri s25, s0, v9.4s[2]
+ OP_ir s25, s1, v8.4s[2]
+
+ OP_rr s28, s0, v8.4s[3]
+ OP_ii s28, s1, v9.4s[3]
+ OP_ri s29, s0, v9.4s[3]
+ OP_ir s29, s1, v8.4s[3]
+.endm
+
+.macro SAVE1x4
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmul s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s0, s24, alphaV0_R
+ fmls s0, s25, alphaV0_I
+ fmul s1, s24, alphaV1_I
+ fmla s1, s25, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s28, alphaV0_R
+ fmls s4, s29, alphaV0_I
+ fmul s5, s28, alphaV1_I
+ fmla s5, s29, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v18.4s, v2.4s, v8.2s[0]
+ OP_ii v18.4s, v3.4s, v9.2s[0]
+ OP_ri v19.4s, v2.4s, v9.2s[0]
+ OP_ir v19.4s, v3.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+
+ OP_rr v22.4s, v2.4s, v8.2s[1]
+ OP_ii v22.4s, v3.4s, v9.2s[1]
+ OP_ri v23.4s, v2.4s, v9.2s[1]
+ OP_ir v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow2]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow2, pCRow1, #32
+
+
+ fmul v6.4s, v22.4s, alphaV0_R
+ fmls v6.4s, v23.4s, alphaV0_I
+ fmul v7.4s, v22.4s, alphaV1_I
+ fmla v7.4s, v23.4s, alphaV1_R
+ st2 {v6.4s, v7.4s}, [pCRow2]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL4x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.2s[0]
+ OP_ii v16.4s, v1.4s, v9.2s[0]
+ OP_ri v17.4s, v0.4s, v9.2s[0]
+ OP_ir v17.4s, v1.4s, v8.2s[0]
+
+ OP_rr v20.4s, v0.4s, v8.2s[1]
+ OP_ii v20.4s, v1.4s, v9.2s[1]
+ OP_ri v21.4s, v0.4s, v9.2s[1]
+ OP_ir v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0_R
+ fmls v4.4s, v21.4s, alphaV0_I
+ fmul v5.4s, v20.4s, alphaV1_I
+ fmla v5.4s, v21.4s, alphaV1_R
+ st2 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, s16
+ fmov s21, s17
+.endm
+
+.macro KERNEL2x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.2s[0]
+ OP_ii v16.2s, v1.2s, v9.2s[0]
+ OP_ri v17.2s, v0.2s, v9.2s[0]
+ OP_ir v17.2s, v1.2s, v8.2s[0]
+
+ OP_rr v20.2s, v0.2s, v8.2s[1]
+ OP_ii v20.2s, v1.2s, v9.2s[1]
+ OP_ri v21.2s, v0.2s, v9.2s[1]
+ OP_ir v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0_R
+ fmls v4.2s, v21.2s, alphaV0_I
+ fmul v5.2s, v20.2s, alphaV1_I
+ fmla v5.2s, v21.2s, alphaV1_R
+ st2 {v4.2s, v5.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld2 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.2s[0]
+ OP_ii s16, s1, v9.2s[0]
+ OP_ri s17, s0, v9.2s[0]
+ OP_ir s17, s1, v8.2s[0]
+
+ OP_rr s20, s0, v8.2s[1]
+ OP_ii s20, s1, v9.2s[1]
+ OP_ri s21, s0, v9.2s[1]
+ OP_ir s21, s1, v8.2s[1]
+.endm
+
+.macro SAVE1x2
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow1, pCRow1, LDC
+
+
+ fmul s4, s20, alphaV0_R
+ fmls s4, s21, alphaV0_I
+ fmul s5, s20, alphaV1_I
+ fmla s5, s21, alphaV1_R
+ st2 {v4.s, v5.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+ ld2 {v2.4s, v3.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.4s[0]
+ OP_ii v16.4s, v1.4s, v8.4s[1]
+ OP_ri v17.4s, v0.4s, v8.4s[1]
+ OP_ir v17.4s, v1.4s, v8.4s[0]
+
+ OP_rr v18.4s, v2.4s, v8.4s[0]
+ OP_ii v18.4s, v3.4s, v8.4s[1]
+ OP_ri v19.4s, v2.4s, v8.4s[1]
+ OP_ir v19.4s, v3.4s, v8.4s[0]
+.endm
+
+.macro SAVE8x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+
+ fmul v2.4s, v18.4s, alphaV0_R
+ fmls v2.4s, v19.4s, alphaV0_I
+ fmul v3.4s, v18.4s, alphaV1_I
+ fmla v3.4s, v19.4s, alphaV1_R
+ st2 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.4s, v1.4s}, [pA]
+ add pA, pA, #32
+
+ OP_rr v16.4s, v0.4s, v8.s[0]
+ OP_ii v16.4s, v1.4s, v9.s[0]
+ OP_ri v17.4s, v0.4s, v9.s[0]
+ OP_ir v17.4s, v1.4s, v8.s[0]
+.endm
+
+.macro SAVE4x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.4s, v16.4s, alphaV0_R
+ fmls v0.4s, v17.4s, alphaV0_I
+ fmul v1.4s, v16.4s, alphaV1_I
+ fmla v1.4s, v17.4s, alphaV1_R
+ st2 {v0.4s, v1.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ OP_rr v16.2s, v0.2s, v8.s[0]
+ OP_ii v16.2s, v1.2s, v9.s[0]
+ OP_ri v17.2s, v0.2s, v9.s[0]
+ OP_ir v17.2s, v1.2s, v8.s[0]
+.endm
+
+.macro SAVE2x1
+ mov pCRow1, pCRow0
+
+
+ fmul v0.2s, v16.2s, alphaV0_R
+ fmls v0.2s, v17.2s, alphaV0_I
+ fmul v1.2s, v16.2s, alphaV1_I
+ fmla v1.2s, v17.2s, alphaV1_R
+ st2 {v0.2s, v1.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ld2 {v8.s, v9.s}[0], [pB]
+ add pB, pB, #8
+ ld2 {v0.s, v1.s}[0], [pA]
+ add pA, pA, #8
+
+ OP_rr s16, s0, v8.s[0]
+ OP_ii s16, s1, v9.s[0]
+ OP_ri s17, s0, v9.s[0]
+ OP_ir s17, s1, v8.s[0]
+.endm
+
+.macro SAVE1x1
+ mov pCRow1, pCRow0
+
+
+ fmul s0, s16, alphaV0_R
+ fmls s0, s17, alphaV0_I
+ fmul s1, s16, alphaV1_I
+ fmla s1, s17, alphaV1_R
+ st2 {v0.s, v1.s}[0], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0_R, s0
+ fmov alpha0_I, s1
+ fmov alpha1_R, s0
+ fmov alpha1_I, s1
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble ctrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
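+// The loop nest mirrors the cgemm kernel above (N in blocks of 4/2/1 columns,
+// M in blocks of 8/4/2/1 rows), with extra tempOffset/tempK bookkeeping for
+// the TRMM triangular offset handling.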
+
+ctrmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+ctrmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L4_M4_BEGIN
+
+ctrmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
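+
+// Standard OpenBLAS TRMM offset bookkeeping: depending on LEFT/TRANSA the
+// pA/pB pointers above are advanced past the part of the packed panels that
+// falls in the zero triangle, and tempK holds the effective inner-product
+// length actually performed for this tile.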
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt ctrmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble ctrmm_kernel_L4_M8_22a
+ .align 5
+
+ctrmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M8_22
+
+
+ctrmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b ctrmm_kernel_L4_M8_44
+
+ctrmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble ctrmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b ctrmm_kernel_L4_M8_44
+
+ctrmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+ctrmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble ctrmm_kernel_L4_M8_100
+
+ctrmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+ctrmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne ctrmm_kernel_L4_M8_20
+
+ctrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L4_END
+
+ tst counterI, #4
+ ble ctrmm_kernel_L4_M2_BEGIN
+
+ctrmm_kernel_L4_M4_20:
+
+ INIT4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M4_40
+
+ctrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M4_22
+
+
+ctrmm_kernel_L4_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M4_100
+
+ctrmm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M4_42
+
+ctrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L4_M4_END:
+
+
+ctrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block (M & 2) left?
+ ble ctrmm_kernel_L4_M1_BEGIN
+
+ctrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M2_40
+
+ctrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M2_22
+
+
+ctrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M2_100
+
+ctrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M2_42
+
+ctrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L4_M2_END:
+
+
+ctrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L4_END
+
+ctrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L4_M1_40
+
+ctrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M1_22
+
+
+ctrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L4_M1_100
+
+ctrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L4_M1_42
+
+ctrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+ctrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt ctrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+ctrmm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction; do a block of 2
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	ctrmm_kernel_L999				// no remaining columns (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble ctrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+ctrmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L2_M4_BEGIN
+
+ctrmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M8_40
+ .align 5
+
+ctrmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M8_22
+
+
+ctrmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M8_100
+
+ctrmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M8_42
+
+ctrmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt ctrmm_kernel_L2_M8_20
+
+ctrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L2_END
+
+	tst	counterI, #4					// is there a 4-row remainder in M?
+ ble ctrmm_kernel_L2_M2_BEGIN
+
+ctrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M4_40
+ .align 5
+
+ctrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M4_22
+
+
+ctrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M4_100
+
+ctrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M4_42
+
+ctrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L2_M4_END:
+
+
+ctrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble ctrmm_kernel_L2_M1_BEGIN
+
+ctrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble ctrmm_kernel_L2_M2_40
+
+ctrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M2_22
+
+
+ctrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M2_100
+
+ctrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M2_42
+
+ctrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L2_M2_END:
+
+
+ctrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L2_END
+
+ctrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble ctrmm_kernel_L2_M1_40
+
+ctrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M1_22
+
+
+ctrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L2_M1_100
+
+ctrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L2_M1_42
+
+ctrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+ctrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+ctrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble ctrmm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+ctrmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble ctrmm_kernel_L1_M4_BEGIN
+
+ctrmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M8_40
+ .align 5
+
+ctrmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M8_22
+
+
+ctrmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M8_100
+
+ctrmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M8_42
+
+ctrmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ctrmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt ctrmm_kernel_L1_M8_20
+
+ctrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble ctrmm_kernel_L1_END
+
+	tst	counterI, #4					// is there a 4-row remainder in M?
+ ble ctrmm_kernel_L1_M2_BEGIN
+
+ctrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M4_40
+ .align 5
+
+ctrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M4_22
+
+
+ctrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M4_100
+
+ctrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M4_42
+
+ctrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ctrmm_kernel_L1_M4_END:
+
+ctrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble ctrmm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble ctrmm_kernel_L1_M1_BEGIN
+
+ctrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M2_40
+
+ctrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M2_22
+
+
+ctrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M2_100
+
+ctrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M2_42
+
+ctrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+ctrmm_kernel_L1_M2_END:
+
+
+ctrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble ctrmm_kernel_L1_END
+
+ctrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble ctrmm_kernel_L1_M1_40
+
+ctrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M1_22
+
+
+ctrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble ctrmm_kernel_L1_M1_100
+
+ctrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt ctrmm_kernel_L1_M1_42
+
+ctrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+ctrmm_kernel_L1_END:
+
+
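+// Epilogue: restore the callee-saved registers spilled in the prologue and
+// return 0.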
+ctrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S
index e88253af1..e2ad11492 100644
--- a/kernel/arm64/dgemm_kernel_4x4.S
+++ b/kernel/arm64/dgemm_kernel_4x4.S
@@ -46,21 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
-#define pA x15
-#define ppC x16
-#define ppCRow0 x17
-#define ppCRow1 x18
-#define ppCRow2 x19
-#define ppA x20
+#define pCRow3 x15
+#define pA x16
+#define ppC x17
+#define ppCRow0 x18
+#define ppCRow1 x19
+#define ppCRow2 x20
+#define ppCRow3 x21
+#define ppA x22
+#define alpha x23
#define alpha0 d10
#define alphaV0 v10.d[0]
-#define alpha1 d11
-#define alphaV1 v11.d[0]
-#define alpha2 d14
-#define alphaV2 v14.d[0]
-#define alpha3 d15
-#define alphaV3 v15.d[0]
+
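+// Prefetch distances, in bytes, used by the PRFM hints in the kernels below.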
+#define A_PRE_SIZE 1024
+#define B_PRE_SIZE 1024
+#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@@ -77,15 +78,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
-// 15 pA
-// 16 ppC
-// 17 ppCRow0
-// 18 must save ppCRow1
-// 19 must save ppCRow2
-// 20 must save ppA
-// 21 must save
-// 22 must save
-// 23 must save
+// 15 pCRow3
+// 16 pA
+// 17 ppC
+// 18 must save ppCRow0
+// 19 must save ppCRow1
+// 20 must save ppCRow2
+// 21 must save ppCRow3
+// 22 must save ppA
+// 23 must save alpha
// 24 must save
// 25 must save
// 26 must save
@@ -106,11 +107,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v08 must save pB00, pB01
//v09 must save pB02, pB03
//v10 must save ALPHA0
-//v11 must save ALPHA1
+//v11 must save
//v12 must save pB10, pB11
//v13 must save pB12, pB13
-//v14 must save ALPHA2
-//v15 must save ALPHA3
+//v14 must save
+//v15 must save
//v16 must save C00, C01
//v17 must save C02, C03
//v18 ppC00, ppC01
@@ -152,222 +153,254 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v0.2d, v1.2d}, [pA]
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ ldp q0, q1, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.2d[0]
- fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v29.2d, v1.2d, v11.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmul v20.2d, v0.2d, v8.2d[1]
- fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v18.2d, v2.2d, v8.2d[0]
- fmul v31.2d, v3.2d, v9.2d[1]
- fmul v22.2d, v2.2d, v8.2d[1]
- fmul v27.2d, v3.2d, v9.2d[0]
+ fmul v31.2d, v3.2d, v11.2d[0]
- ld1 {v12.2d, v13.2d}, [pB] // for next round
- add pB, pB, #32
+ prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
- fmul v24.2d, v0.2d, v9.2d[0]
- fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v22.2d, v2.2d, v9.2d[0]
+ fmul v27.2d, v3.2d, v10.2d[0]
- ld1 {v4.2d, v5.2d} , [pA] // for next round
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+
+ ldp q4, q5, [pA] // for next round
add pA, pA, #32
- fmul v26.2d, v2.2d, v9.2d[0]
- fmul v23.2d, v3.2d, v8.2d[1]
+ fmul v26.2d, v2.2d, v10.2d[0]
+ fmul v23.2d, v3.2d, v9.2d[0]
- ld1 {v6.2d, v7.2d} , [ppA] // for next round
+ ldp q6, q7, [ppA] // for next round
add ppA, ppA, #32
- fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v28.2d, v0.2d, v11.2d[0]
fmul v17.2d, v1.2d, v8.2d[0]
- fmul v30.2d, v2.2d, v9.2d[1]
+
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+
+ fmul v30.2d, v2.2d, v11.2d[0]
fmul v19.2d, v3.2d, v8.2d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.2d[0]
- fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v15.2d[0]
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
fmla v18.2d, v6.2d, v12.2d[0]
- fmla v31.2d, v7.2d, v13.2d[1]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
- prfm PLDL1KEEP, [pB, #512]
+ ldp d10, d11, [pB]
+ add pB, pB, #16
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v27.2d, v7.2d, v13.2d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
- fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
- ld1 {v0.2d, v1.2d}, [pA]
+ prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
+
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+
+ ldp q0, q1, [pA]
add pA, pA, #32
- fmla v26.2d, v6.2d, v13.2d[0]
- fmla v23.2d, v7.2d, v12.2d[1]
- fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v28.2d, v4.2d, v15.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v15.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
.endm
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v11.2d[0]
- ld1 {v12.2d, v13.2d}, [pB] // for next round
- add pB, pB, #32
+ ldp d12, d13, [pB]
+ add pB, pB, #16
fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v9.2d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
- prfm PLDL1KEEP, [pA, #512]
+ ldp d14, d15, [pB]
+ add pB, pB, #16
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
- prfm PLDL1KEEP, [ppA, #512]
+ prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
- ld1 {v4.2d, v5.2d} , [pA] // for next round
+ prfm PLDL1KEEP, [ppA, #A_PRE_SIZE]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+
+ ldp q4, q5, [pA]
add pA, pA, #32
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v23.2d, v3.2d, v8.2d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
- ld1 {v6.2d, v7.2d} , [ppA] // for next round
+ ldp q6, q7, [ppA]
add ppA, ppA, #32
- fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v11.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.2d[0]
- fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
fmla v18.2d, v6.2d, v12.2d[0]
- fmla v27.2d, v7.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
- fmla v20.2d, v4.2d, v12.2d[1]
- fmla v29.2d, v5.2d, v13.2d[1]
- fmla v22.2d, v6.2d, v12.2d[1]
- fmla v31.2d, v7.2d, v13.2d[1]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
- fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
fmla v17.2d, v5.2d, v12.2d[0]
- fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
fmla v19.2d, v7.2d, v12.2d[0]
- fmla v28.2d, v4.2d, v13.2d[1]
- fmla v21.2d, v5.2d, v12.2d[1]
- fmla v30.2d, v6.2d, v13.2d[1]
- fmla v23.2d, v7.2d, v12.2d[1]
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
.endm
.macro KERNEL8x4_SUB
- ld1 {v8.2d, v9.2d}, [pB]
- add pB, pB, #32
- ld1 {v0.2d, v1.2d}, [pA]
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+ ldp q0, q1, [pA]
add pA, pA, #32
fmla v16.2d, v0.2d, v8.2d[0]
- fmla v29.2d, v1.2d, v9.2d[1]
- fmla v20.2d, v0.2d, v8.2d[1]
- fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
- ld1 {v2.2d, v3.2d}, [ppA]
+ ldp q2, q3, [ppA]
add ppA, ppA, #32
- fmla v24.2d, v0.2d, v9.2d[0]
- fmla v21.2d, v1.2d, v8.2d[1]
- fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v11.2d[0]
fmla v17.2d, v1.2d, v8.2d[0]
fmla v18.2d, v2.2d, v8.2d[0]
- fmla v31.2d, v3.2d, v9.2d[1]
- fmla v22.2d, v2.2d, v8.2d[1]
- fmla v27.2d, v3.2d, v9.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
- fmla v26.2d, v2.2d, v9.2d[0]
- fmla v23.2d, v3.2d, v8.2d[1]
- fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+ fmla v30.2d, v2.2d, v11.2d[0]
fmla v19.2d, v3.2d, v8.2d[0]
.endm
.macro SAVE8x4
+ fmov alpha0, alpha
+
+ prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
add ppCRow0, pCRow0, #32
- ld1 {v0.2d, v1.2d}, [pCRow0]
+ ldp q0, q1, [pCRow0]
fmla v0.2d, v16.2d, alphaV0
- fmla v1.2d, v17.2d, alphaV1
- st1 {v0.2d, v1.2d}, [pCRow0]
-
- ld1 {v2.2d, v3.2d}, [ppCRow0]
- fmla v2.2d, v18.2d, alphaV2
- fmla v3.2d, v19.2d, alphaV3
- st1 {v2.2d, v3.2d}, [ppCRow0]
-
- add pCRow1, pCRow0, LDC
- add ppCRow1, ppCRow0, LDC
-
- ld1 {v4.2d, v5.2d}, [pCRow1]
- fmla v4.2d, v20.2d, alphaV0
- fmla v5.2d, v21.2d, alphaV1
- st1 {v4.2d, v5.2d}, [pCRow1]
-
- ld1 {v6.2d, v7.2d}, [ppCRow1]
- fmla v6.2d, v22.2d, alphaV2
- fmla v7.2d, v23.2d, alphaV3
- st1 {v6.2d, v7.2d}, [ppCRow1]
-
- add pCRow2, pCRow1, LDC
- add ppCRow2, ppCRow1, LDC
-
- ld1 {v0.2d, v1.2d}, [pCRow2]
- fmla v0.2d, v24.2d, alphaV0
- fmla v1.2d, v25.2d, alphaV1
- st1 {v0.2d, v1.2d}, [pCRow2]
-
- ld1 {v2.2d, v3.2d}, [ppCRow2]
- fmla v2.2d, v26.2d, alphaV2
- fmla v3.2d, v27.2d, alphaV3
- st1 {v2.2d, v3.2d}, [ppCRow2]
-
- add pCRow1, pCRow2, LDC
- add ppCRow1, ppCRow2, LDC
-
- ld1 {v4.2d, v5.2d}, [pCRow1]
- fmla v4.2d, v28.2d, alphaV0
- fmla v5.2d, v29.2d, alphaV1
- st1 {v4.2d, v5.2d}, [pCRow1]
-
- ld1 {v6.2d, v7.2d}, [ppCRow1]
- fmla v6.2d, v30.2d, alphaV2
- fmla v7.2d, v31.2d, alphaV3
- st1 {v6.2d, v7.2d}, [ppCRow1]
+ fmla v1.2d, v17.2d, alphaV0
+ stp q0, q1, [pCRow0]
add pCRow0, pCRow0, #64
+
+ ldp q2, q3, [ppCRow0]
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
+ stp q2, q3, [ppCRow0]
+
+ prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
+ add ppCRow1, pCRow1, #32
+
+ ldp q4, q5, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV0
+ stp q4, q5, [pCRow1]
+
+ add pCRow1, pCRow1, #64
+
+ ldp q6, q7, [ppCRow1]
+ fmla v6.2d, v22.2d, alphaV0
+ fmla v7.2d, v23.2d, alphaV0
+ stp q6, q7, [ppCRow1]
+
+ prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
+ add ppCRow2, pCRow2, #32
+
+ ldp q0, q1, [pCRow2]
+ fmla v0.2d, v24.2d, alphaV0
+ fmla v1.2d, v25.2d, alphaV0
+ stp q0, q1, [pCRow2]
+
+ add pCRow2, pCRow2, #64
+
+ ldp q2, q3, [ppCRow2]
+ fmla v2.2d, v26.2d, alphaV0
+ fmla v3.2d, v27.2d, alphaV0
+ stp q2, q3, [ppCRow2]
+
+ prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
+ add ppCRow3, pCRow3, #32
+
+ ldp q4, q5, [pCRow3]
+ fmla v4.2d, v28.2d, alphaV0
+ fmla v5.2d, v29.2d, alphaV0
+ stp q4, q5, [pCRow3]
+
+ add pCRow3, pCRow3, #64
+
+ ldp q6, q7, [ppCRow3]
+ fmla v6.2d, v30.2d, alphaV0
+ fmla v7.2d, v31.2d, alphaV0
+ stp q6, q7, [ppCRow3]
.endm
/******************************************************************************/
@@ -403,30 +436,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d, v9.2d}, [pCRow2]
fmla v8.2d, v24.2d, alphaV0
- fmla v9.2d, v25.2d, alphaV1
+ fmla v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV2
- fmla v13.2d, v29.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
+ fmla v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -454,6 +489,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -461,19 +498,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1, pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
ld1 {v8.2d}, [pCRow2]
- fmla v8.2d, v24.2d, alphaV2
+ fmla v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v28.2d, alphaV3
+ fmla v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -498,6 +535,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
+ fmov alpha0, alpha
+
add pCRow1, pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -511,7 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1 {v12.d}[0], [pCRow2]
ld1 {v12.d}[1], [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -540,16 +579,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
ld1 {v12.2d, v13.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV2
- fmla v13.2d, v21.2d, alphaV3
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -574,6 +615,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -581,7 +624,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow1 , pCRow0, LDC
ld1 {v12.2d}, [pCRow1]
- fmla v12.2d, v20.2d, alphaV1
+ fmla v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -604,6 +647,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
+ fmov alpha0, alpha
+
add pCRow1 , pCRow0, LDC
ld1 {v8.d}[0], [pCRow0]
@@ -634,9 +679,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
+ fmov alpha0, alpha
+
ld1 {v8.2d, v9.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
- fmla v9.2d, v17.2d, alphaV1
+ fmla v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -662,6 +709,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
+ fmov alpha0, alpha
+
ld1 {v8.2d}, [pCRow0]
fmla v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -686,6 +735,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
+ fmov alpha0, alpha
+
ldr d8, [pCRow0]
fmadd d8, d16, alpha0, d8
str d8, [pCRow0]
@@ -713,10 +764,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
- fmov alpha0, d0
- fmov alpha1, d0
- fmov alpha2, d0
- fmov alpha3, d0
+ fmov alpha, d0
+ prfm PLDL1KEEP, [origPA]
+ prfm PLDL1KEEP, [origPB]
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -728,12 +778,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble dgemm_kernel_L2_BEGIN
dgemm_kernel_L4_BEGIN:
- mov pCRow0, pC // pCRow0 = C
- add pC, pC, LDC, lsl #2
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+ add pC, pCRow3, LDC
lsl temp, origK, #5 // k * 4 * 8
mov pA, origPA // pA = start of A array
add ppA, temp, pA
+ prfm PLDL1KEEP, [ppA]
//------------------------------------------------------------------------------
@@ -744,43 +798,51 @@ dgemm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
+ .align 5
dgemm_kernel_L4_M8_20:
mov pB, origPB
- asr counterL , origK, #1 // L = K / 2
- cmp counterL , #2 // is there at least 4 to do?
+ asr counterL , origK, #2 // L = K / 4
+ cmp counterL , #2
blt dgemm_kernel_L4_M8_32
- KERNEL8x4_I // do one in the K
- KERNEL8x4_M2 // do another in the K
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
+
.align 5
-
dgemm_kernel_L4_M8_22:
-
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
-
+ .align 5
dgemm_kernel_L4_M8_22a:
+ KERNEL8x4_M1
+ KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
+ .align 5
dgemm_kernel_L4_M8_32:
tst counterL, #1
ble dgemm_kernel_L4_M8_40
KERNEL8x4_I
-
+ KERNEL8x4_M2
+ KERNEL8x4_M1
KERNEL8x4_E
b dgemm_kernel_L4_M8_44
@@ -792,14 +854,22 @@ dgemm_kernel_L4_M8_40:
dgemm_kernel_L4_M8_44:
- ands counterL , origK, #1
+ ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
+ .align 5
dgemm_kernel_L4_M8_46:
KERNEL8x4_SUB
+ subs counterL, counterL, #1
+ bne dgemm_kernel_L4_M8_46
+
dgemm_kernel_L4_M8_100:
+ lsl temp, origK, #5
+ prfm PLDL1KEEP, [pA, temp]
+ prfm PLDL1KEEP, [ppA, temp]
+ prfm PLDL1KEEP, [origPB]
SAVE8x4
@@ -810,7 +880,6 @@ dgemm_kernel_L4_M8_END:
subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
-
dgemm_kernel_L4_M4_BEGIN:
mov counterI, origM
tst counterI , #7
diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S
new file mode 100755
index 000000000..88e9a773d
--- /dev/null
+++ b/kernel/arm64/dgemm_kernel_4x8.S
@@ -0,0 +1,1689 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          d0         X3        x4       x5           x6   */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0 d2
+#define alphaV0 v2.d[0]
+#define alpha1 d3
+#define alphaV1 v3.d[0]
+#define alpha2 d6
+#define alphaV2 v6.d[0]
+#define alpha3 d7
+#define alphaV3 v7.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA00, pA01
+//v01 pA02, pA03
+//v02 ALPHA0
+//v03 ALPHA1
+//v04 pA10, pA11
+//v05 pA12, pA13
+//v06 ALPHA2
+//v07 ALPHA3
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save pB0_4, pB0_5
+//v11 must save pB0_6, pB0_7
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save pB1_4, pB1_5
+//v15 must save pB1_6, pB1_7
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
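+// INIT4x8 clears the sixteen accumulator registers (v16-v31) that hold the
+// 4x8 block of C computed by this micro-kernel.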
+.macro INIT4x8
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, xzr
+ fmov d19, d16
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
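+// KERNEL4x8_I primes the software pipeline: it multiplies the first A/B
+// slices held in v0/v1 and v8-v11 while already loading the next set into
+// v4/v5 and v12-v15.  KERNEL4x8_M1 and _M2 then alternate between the two
+// register sets so the loads for the next iteration overlap the FMAs of the
+// current one, and KERNEL4x8_E drains the pipeline without further loads.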
+.macro KERNEL4x8_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v0.2d, v8.2d[1]
+ fmul v19.2d, v1.2d, v8.2d[1]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v22.2d, v0.2d, v9.2d[1]
+ fmul v23.2d, v1.2d, v9.2d[1]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v26.2d, v0.2d, v10.2d[1]
+ fmul v27.2d, v1.2d, v10.2d[1]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v30.2d, v0.2d, v11.2d[1]
+ fmul v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+.endm
+
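+// KERNEL4x8_SUB is the plain, non-pipelined step: it loads one k-step of the
+// packed A and B panels and issues all sixteen FMAs.  It handles the K
+// remainder and the cases where K is too small for the I/M1/M2/E sequence.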
+.macro KERNEL4x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+.endm
+
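+// SAVE4x8 walks the eight columns of the C block (stepping the pCRow
+// pointers by LDC), scales the accumulators by alpha and adds them into C.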
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d, v11.2d}, [pCRow1]
+ fmla v10.2d, v18.2d, alphaV2
+ fmla v11.2d, v19.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow2]
+ fmla v12.2d, v20.2d, alphaV0
+ fmla v13.2d, v21.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d, v15.2d}, [pCRow1]
+ fmla v14.2d, v22.2d, alphaV2
+ fmla v15.2d, v23.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d, v11.2d}, [pCRow1]
+ fmla v10.2d, v26.2d, alphaV2
+ fmla v11.2d, v27.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow2]
+ fmla v12.2d, v28.2d, alphaV0
+ fmla v13.2d, v29.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ ld1 {v14.2d, v15.2d}, [pCRow1]
+ fmla v14.2d, v30.2d, alphaV2
+ fmla v15.2d, v31.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov d16, xzr
+ fmov d18, xzr
+ fmov d20, xzr
+ fmov d22, d16
+ fmov d24, xzr
+ fmov d26, d16
+ fmov d28, xzr
+ fmov d30, d16
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d}, [pCRow1]
+ fmla v10.2d, v18.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow2]
+ fmla v12.2d, v20.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d}, [pCRow1]
+ fmla v14.2d, v22.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v10.2d}, [pCRow1]
+ fmla v10.2d, v26.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow2]
+ fmla v12.2d, v28.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v14.2d}, [pCRow1]
+ fmla v14.2d, v30.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov d16, xzr
+ fmov d20, xzr
+ fmov d24, xzr
+ fmov d28, xzr
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ldr d0, [pA]
+ add pA, pA, #8
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+ fmla v24.2d, v10.2d, v0.d[0]
+ fmla v28.2d, v11.2d, v0.d[0]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v10.d}[0], [pCRow2]
+ ld1 {v10.d}[1], [pCRow1]
+ fmla v10.2d, v20.2d, alphaV1
+ st1 {v10.d}[0], [pCRow2]
+ st1 {v10.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v24.2d, alphaV2
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v14.d}[0], [pCRow2]
+ ld1 {v14.d}[1], [pCRow1]
+ fmla v14.2d, v28.2d, alphaV3
+ st1 {v14.d}[0], [pCRow2]
+ st1 {v14.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v29.2d, v1.2d, v9.2d[1]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v25.2d, v1.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v8.2d[1]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV2
+ fmla v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ ldr d8, [pCRow0]
+ fmadd d8, d16, alpha0, d8
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
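+// Entry: spill the callee-saved d8-d17 and x18-x28, copy alpha (passed in
+// d0) into the four alpha registers and convert ldc from elements to bytes,
+// then loop over N in blocks of 8, 4, 2 and 1 columns.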
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble dgemm_kernel_L4_BEGIN
+
+/******************************************************************************/
+
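+// Outer loop over N in blocks of 8 columns: pCRow0 points at the current
+// block of C, pC is advanced by 8*LDC, and pA restarts at the top of A.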
+dgemm_kernel_L8_BEGIN:
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L8_M2_BEGIN
+
+dgemm_kernel_L8_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dgemm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dgemm_kernel_L8_M4_22a
+ .align 5
+
+dgemm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M4_22
+
+
+dgemm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b dgemm_kernel_L8_M4_44
+
+dgemm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+
+ KERNEL4x8_E
+
+ b dgemm_kernel_L8_M4_44
+
+
+dgemm_kernel_L8_M4_40:
+
+ INIT4x8
+
+dgemm_kernel_L8_M4_44:
+
+ ands counterL , origK, #1
+ ble dgemm_kernel_L8_M4_100
+
+dgemm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+dgemm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+dgemm_kernel_L8_M4_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L8_M4_20
+
+dgemm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L8_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble dgemm_kernel_L8_M1_BEGIN
+
+dgemm_kernel_L8_M2_20:
+
+ INIT2x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L8_M2_40
+
+dgemm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M2_22
+
+
+dgemm_kernel_L8_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L8_M2_100
+
+dgemm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M2_42
+
+dgemm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+dgemm_kernel_L8_M2_END:
+
+
+dgemm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L8_END
+
+dgemm_kernel_L8_M1_20:
+
+ INIT1x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L8_M1_40
+
+dgemm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M1_22
+
+
+dgemm_kernel_L8_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L8_M1_100
+
+dgemm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L8_M1_42
+
+dgemm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+dgemm_kernel_L8_END:
+
+ lsl temp, origK, #6
+ add origPB, origPB, temp // B = B + K * 8 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dgemm_kernel_L8_BEGIN
+
+
+/******************************************************************************/
+
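+// Handle a remaining block of 4 columns of N (N mod 8 >= 4) with the 4x4
+// micro-kernel.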
+dgemm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble dgemm_kernel_L999
+
+ tst counterJ , #4
+ ble dgemm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L4_M2_BEGIN
+
+dgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dgemm_kernel_L4_M4_22a
+ .align 5
+
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_22
+
+
+dgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+dgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+
+ KERNEL4x4_E
+
+ b dgemm_kernel_L4_M4_44
+
+
+dgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+dgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+dgemm_kernel_L4_M4_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L4_M4_20
+
+dgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row remainder in M?
+ ble dgemm_kernel_L4_M1_BEGIN
+
+dgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M2_40
+
+dgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_22
+
+
+dgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M2_100
+
+dgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_42
+
+dgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+dgemm_kernel_L4_M2_END:
+
+
+dgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L4_END
+
+dgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M1_40
+
+dgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_22
+
+
+dgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M1_100
+
+dgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_42
+
+dgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+dgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L2_BEGIN:						// less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dgemm_kernel_L999				// nothing left in N (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble dgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+
+dgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI,#0
+ ble dgemm_kernel_L2_M2_BEGIN
+
+dgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M4_40
+ .align 5
+
+dgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_22
+
+
+dgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M4_100
+
+dgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_42
+
+dgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+dgemm_kernel_L2_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L2_M4_20
+
+
+dgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L2_M1_BEGIN
+
+dgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M2_40
+
+dgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_22
+
+
+dgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M2_100
+
+dgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_42
+
+dgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+dgemm_kernel_L2_M2_END:
+
+
+dgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L2_END
+
+dgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dgemm_kernel_L2_M1_40
+
+dgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_22
+
+
+dgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M1_100
+
+dgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_42
+
+dgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+dgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dgemm_kernel_L1_M2_BEGIN
+
+dgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M4_40
+ .align 5
+
+dgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_22
+
+
+dgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M4_100
+
+dgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_42
+
+dgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+dgemm_kernel_L1_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L1_M4_20
+
+
+dgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L1_M1_BEGIN
+
+dgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M2_40
+
+dgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_22
+
+
+dgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M2_100
+
+dgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_42
+
+dgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+dgemm_kernel_L1_M2_END:
+
+
+dgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L1_END
+
+dgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M1_40
+
+dgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_22
+
+
+dgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M1_100
+
+dgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_42
+
+dgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dgemm_kernel_L1_END:
+
+
+dgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S
new file mode 100755
index 000000000..a607fecc4
--- /dev/null
+++ b/kernel/arm64/dgemm_kernel_8x4.S
@@ -0,0 +1,1570 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pCRow3 x15
+#define pA x16
+#define alpha x17
+
+#define alpha0 d10
+#define alphaV0 v10.d[0]
+#define alpha1 d11
+#define alphaV1 v11.d[0]
+#define alpha2 d14
+#define alphaV2 v14.d[0]
+#define alpha3 d15
+#define alphaV3 v15.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pCRow3
+// 16 pA
+// 17 alpha
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1
+//v01 pA0_2, pA0_3
+//v02 pA0_4, pA0_5
+//v03 pA0_6, pA0_7
+//v04 pA1_0, pA1_1
+//v05 pA1_2, pA1_3
+//v06 pA1_4, pA1_5
+//v07 pA1_6, pA1_7
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
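+// The 8x4 micro-kernel keeps an 8x4 tile of C in v16-v31 (two doubles per
+// register). Each K step multiplies an 8-element column slice of A (held in
+// v0-v3 or v4-v7) by a 4-element row slice of B (d8-d11 or d12-d15) and
+// accumulates into that tile.
+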
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, xzr
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
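+// The main K loop is software pipelined: KERNEL8x4_I primes the pipeline
+// (plain fmul into the accumulators, then a load of the next A/B values into
+// v4-v7 and d12-d15), KERNEL8x4_M1 and KERNEL8x4_M2 alternate between the
+// two register sets (compute on one while loading the other), and
+// KERNEL8x4_E drains the pipeline. KERNEL8x4_SUB is the plain, non-pipelined
+// step used for the K remainder.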
+.macro KERNEL8x4_I
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ fmul v18.2d, v2.2d, v8.2d[0]
+ fmul v19.2d, v3.2d, v8.2d[0]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+
+ fmul v22.2d, v2.2d, v9.2d[0]
+ fmul v23.2d, v3.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+
+ fmul v26.2d, v2.2d, v10.2d[0]
+ fmul v27.2d, v3.2d, v10.2d[0]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+
+ fmul v30.2d, v2.2d, v11.2d[0]
+ fmul v31.2d, v3.2d, v11.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+
+ ld1 {v4.2d}, [pA], #16
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v5.2d}, [pA], #16
+
+ fmla v30.2d, v2.2d, v11.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
+
+ ldp d12, d13, [pB]
+ add pB, pB, #16
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+
+ ldp d14, d15, [pB]
+ add pB, pB, #16
+
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ ld1 {v6.2d}, [pA], #16
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+
+ ld1 {v7.2d}, [pA], #16
+
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ prfm PLDL1KEEP, [pA, #224]
+ prfm PLDL1KEEP, [pA, #224+64]
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
+
+ ld1 {v0.2d}, [pA], #16
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+
+ ld1 {v1.2d}, [pA], #16
+
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ ld1 {v2.2d}, [pA], #16
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+
+ ld1 {v3.2d}, [pA], #16
+
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+
+ prfm PLDL1KEEP, [pB, #640]
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v6.2d, v13.2d[0]
+ fmla v23.2d, v7.2d, v13.2d[0]
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v6.2d, v14.2d[0]
+ fmla v27.2d, v7.2d, v14.2d[0]
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v6.2d, v15.2d[0]
+ fmla v31.2d, v7.2d, v15.2d[0]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+ ldp d8, d9, [pB]
+ add pB, pB, #16
+ ldp d10, d11, [pB]
+ add pB, pB, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v2.2d, v9.2d[0]
+ fmla v23.2d, v3.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v2.2d, v10.2d[0]
+ fmla v27.2d, v3.2d, v10.2d[0]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v2.2d, v11.2d[0]
+ fmla v31.2d, v3.2d, v11.2d[0]
+.endm
+
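+// SAVE8x4 finishes a tile as C = C + alpha*(A*B): for each of the four C
+// columns (pCRow0..pCRow3) it loads 8 doubles of C, accumulates the scaled
+// results with fmla, stores them back, advances the column pointer by
+// 64 bytes, and prefetches the next stretch of C.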
+.macro SAVE8x4
+ fmov alpha0, alpha
+
+ ld1 {v0.2d, v1.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV0
+ st1 {v0.2d, v1.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ ld1 {v2.2d, v3.2d}, [pCRow0]
+ fmla v2.2d, v18.2d, alphaV0
+ fmla v3.2d, v19.2d, alphaV0
+ st1 {v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+
+ ld1 {v4.2d, v5.2d}, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV0
+ st1 {v4.2d, v5.2d}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld1 {v6.2d, v7.2d}, [pCRow1]
+ fmla v6.2d, v22.2d, alphaV0
+ fmla v7.2d, v23.2d, alphaV0
+ st1 {v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow1, pCRow1, #32
+
+ ld1 {v0.2d, v1.2d}, [pCRow2]
+ fmla v0.2d, v24.2d, alphaV0
+ fmla v1.2d, v25.2d, alphaV0
+ st1 {v0.2d, v1.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+ ld1 {v2.2d, v3.2d}, [pCRow2]
+ fmla v2.2d, v26.2d, alphaV0
+ fmla v3.2d, v27.2d, alphaV0
+ st1 {v2.2d, v3.2d}, [pCRow2]
+
+ add pCRow2, pCRow2, #32
+
+ ld1 {v4.2d, v5.2d}, [pCRow3]
+ fmla v4.2d, v28.2d, alphaV0
+ fmla v5.2d, v29.2d, alphaV0
+ st1 {v4.2d, v5.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
+ ld1 {v6.2d, v7.2d}, [pCRow3]
+ fmla v6.2d, v30.2d, alphaV0
+ fmla v7.2d, v31.2d, alphaV0
+ st1 {v6.2d, v7.2d}, [pCRow3]
+
+ add pCRow3, pCRow3, #32
+
+ prfm PLDL2KEEP, [pCRow0, #128]
+ prfm PLDL2KEEP, [pCRow1, #128]
+ prfm PLDL2KEEP, [pCRow2, #128]
+ prfm PLDL2KEEP, [pCRow3, #128]
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d, v9.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV0
+ fmla v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV2
+ fmla v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v8.2d}, [pCRow2]
+ fmla v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v12.d}[0], [pCRow2]
+ ld1 {v12.d}[1], [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV1
+ fmla v2.2d, v18.2d, alphaV2
+ fmla v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+ fmla v4.2d, v20.2d, alphaV0
+ fmla v5.2d, v21.2d, alphaV1
+ fmla v6.2d, v22.2d, alphaV2
+ fmla v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v12.2d, v13.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV2
+ fmla v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v12.2d}, [pCRow1]
+ fmla v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ ld1 {v8.d}[0], [pCRow0]
+ ld1 {v8.d}[1], [pCRow1]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+ fmla v0.2d, v16.2d, alphaV0
+ fmla v1.2d, v17.2d, alphaV1
+ fmla v2.2d, v18.2d, alphaV2
+ fmla v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2d, v9.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ fmla v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2d}, [pCRow0]
+ fmla v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ ldr d8, [pCRow0]
+ fmadd d8, d16, alpha0, d8
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
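+	// alpha arrives in d0; park it in the GP register x17 so d0-d7 remain
+	// free as scratch inside the kernels. SAVE8x4 moves it back into d10
+	// (alpha0) right before it is needed.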
+ fmov alpha, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble dgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
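+// Label naming: dgemm_kernel_L<n>_M<m>_* handles a block of <n> columns of
+// B against <m> rows of A. The outer J loop walks N in steps of 4, the I
+// loop walks M in steps of 8, and the inner L loop walks K (unrolled by 8
+// in the pipelined path); M and N remainders fall through to the narrower
+// sections below.
+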
+dgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC
+ add pCRow1, pCRow0, LDC
+ add pCRow2, pCRow1, LDC
+ add pCRow3, pCRow2, LDC
+ add pC, pCRow3, LDC
+
+ mov pA, origPA // pA = start of A array
+
+dgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L4_M4_BEGIN
+
+dgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // L = K / 8
+	cmp	counterL , #2				// at least 2 blocks of 8 to do?
+ blt dgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #2 // subtract 2
+ ble dgemm_kernel_L4_M8_22a
+ .align 5
+
+dgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M8_22
+
+
+dgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dgemm_kernel_L4_M8_44
+
+dgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble dgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dgemm_kernel_L4_M8_44
+
+dgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+dgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #7
+ ble dgemm_kernel_L4_M8_100
+
+dgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+ subs counterL, counterL, #1
+ bne dgemm_kernel_L4_M8_46
+
+dgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+dgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne dgemm_kernel_L4_M8_20
+
+dgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble dgemm_kernel_L4_M2_BEGIN
+
+dgemm_kernel_L4_M4_20:
+
+ INIT4x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M4_40
+
+dgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_22
+
+dgemm_kernel_L4_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M4_100
+
+dgemm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M4_42
+
+dgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+dgemm_kernel_L4_M4_END:
+
+
+dgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L4_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L4_M1_BEGIN
+
+dgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M2_40
+
+dgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_22
+
+
+dgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M2_100
+
+dgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M2_42
+
+dgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+dgemm_kernel_L4_M2_END:
+
+
+dgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L4_END
+
+dgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L4_M1_40
+
+dgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_22
+
+
+dgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L4_M1_100
+
+dgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L4_M1_42
+
+dgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+dgemm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+dgemm_kernel_L2_BEGIN:						// less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dgemm_kernel_L999				// nothing left in N (N mod 4 == 0)
+
+ tst counterJ , #2
+ ble dgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L2_M4_BEGIN
+
+dgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M8_40
+ .align 5
+
+dgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M8_22
+
+
+dgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M8_100
+
+dgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M8_42
+
+dgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+dgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L2_M8_20
+
+dgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #4					// at least 4 rows of M left?
+ ble dgemm_kernel_L2_M2_BEGIN
+
+dgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M4_40
+ .align 5
+
+dgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_22
+
+
+dgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M4_100
+
+dgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M4_42
+
+dgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+dgemm_kernel_L2_M4_END:
+
+
+dgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L2_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L2_M1_BEGIN
+
+dgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dgemm_kernel_L2_M2_40
+
+dgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_22
+
+
+dgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M2_100
+
+dgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M2_42
+
+dgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+dgemm_kernel_L2_M2_END:
+
+
+dgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L2_END
+
+dgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dgemm_kernel_L2_M1_40
+
+dgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_22
+
+
+dgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L2_M1_100
+
+dgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L2_M1_42
+
+dgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+dgemm_kernel_L2_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dgemm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+dgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dgemm_kernel_L1_M4_BEGIN
+
+dgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M8_40
+ .align 5
+
+dgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M8_22
+
+
+dgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M8_100
+
+dgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M8_42
+
+dgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+dgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dgemm_kernel_L1_M8_20
+
+dgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #4					// at least 4 rows of M left?
+ ble dgemm_kernel_L1_M2_BEGIN
+
+dgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M4_40
+ .align 5
+
+dgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_22
+
+
+dgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M4_100
+
+dgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M4_42
+
+dgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+dgemm_kernel_L1_M4_END:
+
+dgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dgemm_kernel_L1_END
+
+	tst	counterI, #2					// at least 2 rows of M left?
+ ble dgemm_kernel_L1_M1_BEGIN
+
+dgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M2_40
+
+dgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_22
+
+
+dgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M2_100
+
+dgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M2_42
+
+dgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+dgemm_kernel_L1_M2_END:
+
+
+dgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dgemm_kernel_L1_END
+
+dgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dgemm_kernel_L1_M1_40
+
+dgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_22
+
+
+dgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble dgemm_kernel_L1_M1_100
+
+dgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dgemm_kernel_L1_M1_42
+
+dgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dgemm_kernel_L1_END:
+
+
+dgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S
new file mode 100755
index 000000000..eb7397faa
--- /dev/null
+++ b/kernel/arm64/dtrmm_kernel_4x8.S
@@ -0,0 +1,2026 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 d2
+#define alphaV0 v2.d[0]
+#define alpha1 d3
+#define alphaV1 v3.d[0]
+#define alpha2 d6
+#define alphaV2 v6.d[0]
+#define alpha3 d7
+#define alphaV3 v7.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA00, pA01
+//v01 pA02, pA03
+//v02 ALPHA0
+//v03 ALPHA1
+//v04 pA10, pA11
+//v05 pA12, pA13
+//v06 ALPHA2
+//v07 ALPHA3
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save pB0_4, pB0_5
+//v11 must save pB0_6, pB0_7
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save pB1_4, pB1_5
+//v15 must save pB1_6, pB1_7
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
+
+/*******************************************************************************
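+// TRMM counterpart of the 4x8 dgemm kernel: the same 4x8 blocking and
+// pipelined K loop, but C is overwritten (C = alpha*A*B, so the SAVE macros
+// do not read the old C), and the offset/tempOffset/tempK bookkeeping below
+// limits each tile to the K range implied by the triangular operand
+// (selected at compile time via LEFT/TRANSA).
+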
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT4x8
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, xzr
+ fmov d19, d16
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
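+// As in the gemm kernel, the K loop is software pipelined: KERNEL4x8_I
+// primes the pipeline with fmul and loads the second register set
+// (v4/v5 and v12-v15), M1/M2 alternate compute and load between the two
+// sets, E drains the pipeline, and SUB is the plain single-step version
+// used for remainders.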
+.macro KERNEL4x8_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v0.2d, v8.2d[1]
+ fmul v19.2d, v1.2d, v8.2d[1]
+
+ fmul v20.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v9.2d[0]
+ fmul v22.2d, v0.2d, v9.2d[1]
+ fmul v23.2d, v1.2d, v9.2d[1]
+
+ fmul v24.2d, v0.2d, v10.2d[0]
+ fmul v25.2d, v1.2d, v10.2d[0]
+ fmul v26.2d, v0.2d, v10.2d[1]
+ fmul v27.2d, v1.2d, v10.2d[1]
+
+ fmul v28.2d, v0.2d, v11.2d[0]
+ fmul v29.2d, v1.2d, v11.2d[0]
+ fmul v30.2d, v0.2d, v11.2d[1]
+ fmul v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v14.2d, v15.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v4.2d, v12.2d[1]
+ fmla v19.2d, v5.2d, v12.2d[1]
+
+ fmla v20.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v13.2d[0]
+ fmla v22.2d, v4.2d, v13.2d[1]
+ fmla v23.2d, v5.2d, v13.2d[1]
+
+ fmla v24.2d, v4.2d, v14.2d[0]
+ fmla v25.2d, v5.2d, v14.2d[0]
+ fmla v26.2d, v4.2d, v14.2d[1]
+ fmla v27.2d, v5.2d, v14.2d[1]
+
+ fmla v28.2d, v4.2d, v15.2d[0]
+ fmla v29.2d, v5.2d, v15.2d[0]
+ fmla v30.2d, v4.2d, v15.2d[1]
+ fmla v31.2d, v5.2d, v15.2d[1]
+.endm
+
+.macro KERNEL4x8_SUB
+	ld1	{v8.2d, v9.2d}, [pB]		// B values for this K step
+ add pB, pB, #32
+	ld1	{v0.2d, v1.2d}, [pA]		// A values for this K step
+ add pA, pA, #32
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+ fmla v19.2d, v1.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+ fmla v23.2d, v1.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v25.2d, v1.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+ fmla v27.2d, v1.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v29.2d, v1.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+ fmla v31.2d, v1.2d, v11.2d[1]
+.endm
+
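+// Unlike the gemm SAVE macros, the trmm SAVE macros do not accumulate into
+// the existing C: the results are scaled by alpha with fmul and stored
+// directly, since TRMM overwrites C.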
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v18.2d, alphaV2
+ fmul v11.2d, v19.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV0
+ fmul v13.2d, v21.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v22.2d, alphaV2
+ fmul v15.2d, v23.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v26.2d, alphaV2
+ fmul v11.2d, v27.2d, alphaV3
+ st1 {v10.2d, v11.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV0
+ fmul v13.2d, v29.2d, alphaV1
+ st1 {v12.2d, v13.2d}, [pCRow2]
+
+ fmul v14.2d, v30.2d, alphaV2
+ fmul v15.2d, v31.2d, alphaV3
+ st1 {v14.2d, v15.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov d16, xzr
+ fmov d18, xzr
+ fmov d20, xzr
+ fmov d22, d16
+ fmov d24, xzr
+ fmov d26, d16
+ fmov d28, xzr
+ fmov d30, d16
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v18.2d, v0.2d, v8.2d[1]
+
+ fmla v20.2d, v0.2d, v9.2d[0]
+ fmla v22.2d, v0.2d, v9.2d[1]
+
+ fmla v24.2d, v0.2d, v10.2d[0]
+ fmla v26.2d, v0.2d, v10.2d[1]
+
+ fmla v28.2d, v0.2d, v11.2d[0]
+ fmla v30.2d, v0.2d, v11.2d[1]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v18.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v22.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v10.2d, v26.2d, alphaV2
+ st1 {v10.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV0
+ st1 {v12.2d}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v14.2d, v30.2d, alphaV2
+ st1 {v14.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov d16, xzr
+ fmov d20, xzr
+ fmov d24, xzr
+ fmov d28, xzr
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ldr d0, [pA]
+ add pA, pA, #8
+ ld1 {v10.2d, v11.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+ fmla v24.2d, v10.2d, v0.d[0]
+ fmla v28.2d, v11.2d, v0.d[0]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v10.2d, v20.2d, alphaV1
+ st1 {v10.d}[0], [pCRow2]
+ st1 {v10.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v24.2d, alphaV2
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v14.2d, v28.2d, alphaV3
+ st1 {v14.d}[0], [pCRow2]
+ st1 {v14.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v29.2d, v1.2d, v9.2d[1]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v25.2d, v1.2d, v9.2d[0]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v21.2d, v1.2d, v8.2d[1]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v17.2d, v1.2d, v8.2d[0]
+
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ ld1 {v12.2d, v13.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ ld1 {v4.2d, v5.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ ld1 {v8.2d, v9.2d}, [pB] // For next round
+ add pB, pB, #32
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ ld1 {v0.2d, v1.2d}, [pA] // For next round
+ add pA, pA, #32
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v29.2d, v5.2d, v13.2d[1]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v25.2d, v5.2d, v13.2d[0]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v21.2d, v5.2d, v12.2d[1]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v17.2d, v5.2d, v12.2d[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x4
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV2
+ fmul v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ fmul d8, d16, alpha0
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble dtrmm_kernel_L4_BEGIN
+
+/******************************************************************************/
+
+dtrmm_kernel_L8_BEGIN:
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L8_M2_BEGIN
+
+dtrmm_kernel_L8_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #8
+#endif
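+
+// The two conditional blocks above are the usual TRMM bookkeeping: they pick
+// where this tile starts inside the packed A and B panels (tempOffset scaled
+// by the 4-row and 8-column tile strides) and how many k iterations (tempK)
+// actually contribute, so only the triangular part of the factor is touched.
+// The same pattern repeats, with adjusted shifts, for every tile size below.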
+
+ asr counterL, tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dtrmm_kernel_L8_M4_22a
+ .align 5
+
+dtrmm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M4_22
+
+
+dtrmm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b dtrmm_kernel_L8_M4_44
+
+dtrmm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+
+ KERNEL4x8_E
+
+ b dtrmm_kernel_L8_M4_44
+
+
+dtrmm_kernel_L8_M4_40:
+
+ INIT4x8
+
+dtrmm_kernel_L8_M4_44:
+
+ ands counterL, tempK, #1
+ ble dtrmm_kernel_L8_M4_100
+
+dtrmm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+dtrmm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L8_M4_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L8_M4_20
+
+dtrmm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L8_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L8_M1_BEGIN
+
+dtrmm_kernel_L8_M2_20:
+
+ INIT2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL, tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L8_M2_40
+
+dtrmm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M2_22
+
+
+dtrmm_kernel_L8_M2_40:
+
+ ands counterL, tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L8_M2_100
+
+dtrmm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M2_42
+
+dtrmm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L8_M2_END:
+
+
+dtrmm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L8_END
+
+dtrmm_kernel_L8_M1_20:
+
+ INIT1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #6
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL, tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L8_M1_40
+
+dtrmm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M1_22
+
+
+dtrmm_kernel_L8_M1_40:
+
+ ands counterL, tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L8_M1_100
+
+dtrmm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L8_M1_42
+
+dtrmm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #6
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L8_END:
+
+ lsl temp, origK, #6
+ add origPB, origPB, temp // B = B + K * 8 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dtrmm_kernel_L8_BEGIN
+
+
+/******************************************************************************/
+
+dtrmm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble dtrmm_kernel_L999
+
+ tst counterJ , #4
+ ble dtrmm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L4_M2_BEGIN
+
+dtrmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL, tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble dtrmm_kernel_L4_M4_22a
+ .align 5
+
+dtrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_22
+
+
+dtrmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b dtrmm_kernel_L4_M4_44
+
+dtrmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+
+ KERNEL4x4_E
+
+ b dtrmm_kernel_L4_M4_44
+
+
+dtrmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+dtrmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble dtrmm_kernel_L4_M4_100
+
+dtrmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+dtrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L4_M4_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L4_M4_20
+
+dtrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L4_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L4_M1_BEGIN
+
+dtrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M2_40
+
+dtrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_22
+
+
+dtrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M2_100
+
+dtrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_42
+
+dtrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L4_M2_END:
+
+
+dtrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L4_END
+
+dtrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M1_40
+
+dtrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_22
+
+
+dtrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M1_100
+
+dtrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_42
+
+dtrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+dtrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+/******************************************************************************/
+
+dtrmm_kernel_L2_BEGIN:				// less than 4 columns of N left
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dtrmm_kernel_L999		// N mod 4 == 0: nothing left to do
+
+ tst counterJ , #2
+ ble dtrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+
+dtrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI,#0
+ ble dtrmm_kernel_L2_M2_BEGIN
+
+dtrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M4_40
+ .align 5
+
+dtrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_22
+
+
+dtrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M4_100
+
+dtrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_42
+
+dtrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L2_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L2_M4_20
+
+
+dtrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L2_M1_BEGIN
+
+dtrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M2_40
+
+dtrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_22
+
+
+dtrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M2_100
+
+dtrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_42
+
+dtrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L2_M2_END:
+
+
+dtrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L2_END
+
+dtrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dtrmm_kernel_L2_M1_40
+
+dtrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_22
+
+
+dtrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M1_100
+
+dtrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_42
+
+dtrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+dtrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dtrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dtrmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #2 // counterI = counterI / 4
+ cmp counterI, #0
+ ble dtrmm_kernel_L1_M2_BEGIN
+
+dtrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M4_40
+ .align 5
+
+dtrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_22
+
+
+dtrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M4_100
+
+dtrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_42
+
+dtrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L1_M4_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L1_M4_20
+
+
+dtrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L1_M1_BEGIN
+
+dtrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M2_40
+
+dtrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_22
+
+
+dtrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M2_100
+
+dtrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_42
+
+dtrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+dtrmm_kernel_L1_M2_END:
+
+
+dtrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L1_END
+
+dtrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M1_40
+
+dtrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_22
+
+
+dtrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M1_100
+
+dtrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_42
+
+dtrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dtrmm_kernel_L1_END:
+
+
+dtrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S
new file mode 100755
index 000000000..6890505bd
--- /dev/null
+++ b/kernel/arm64/dtrmm_kernel_8x4.S
@@ -0,0 +1,1849 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 d0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 d10
+#define alphaV0 v10.d[0]
+#define alpha1 d11
+#define alphaV1 v11.d[0]
+#define alpha2 d14
+#define alphaV2 v14.d[0]
+#define alpha3 d15
+#define alphaV3 v15.d[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1
+//v01 pA0_2, pA0_3
+//v02 pA0_4, pA0_5
+//v03 pA0_6, pA0_7
+//v04 pA1_0, pA1_1
+//v05 pA1_2, pA1_3
+//v06 pA1_4, pA1_5
+//v07 pA1_6, pA1_7
+//v08 must save pB0_0, pB0_1
+//v09 must save pB0_2, pB0_3
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB1_0, pB1_1
+//v13 must save pB1_2, pB1_3
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01
+//v17 must save C02, C03
+//v18 C04, C05
+//v19 C06, C07
+//v20 C10, C11
+//v21 C12, C13
+//v22 C14, C15
+//v23 C16, C17
+//v24 C20, C21
+//v25 C22, C23
+//v26 C24, C25
+//v27 C26, C27
+//v28 C30, C31
+//v29 C32, C33
+//v30 C34, C35
+//v31 C36, C37
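+
+/* Illustrative C equivalent of one 8x4 micro-tile (a sketch, not build code;
+   it assumes the packed layout implied by the loads below, i.e. pA advances
+   8 doubles and pB 4 doubles per k step, and ldc is counted in elements):
+
+       double acc[4][8] = {0};                 // v16..v31, two doubles each
+       for (k = 0; k < K; k++)                 // K = k count for this tile
+           for (j = 0; j < 4; j++)
+               for (i = 0; i < 8; i++)
+                   acc[j][i] += pA[8*k + i] * pB[4*k + j];
+       for (j = 0; j < 4; j++)                 // SAVE8x4: scale and store,
+           for (i = 0; i < 8; i++)             // C is overwritten (TRMM)
+               C[j*ldc + i] = alpha * acc[j][i];
+*/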
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x4
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, xzr
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+ fmov d24, xzr
+ fmov d25, d16
+ fmov d26, d17
+ fmov d27, d18
+ fmov d28, xzr
+ fmov d29, d16
+ fmov d30, d17
+ fmov d31, d18
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmul v16.2d, v0.2d, v8.2d[0]
+ fmul v17.2d, v1.2d, v8.2d[0]
+ fmul v18.2d, v2.2d, v8.2d[0]
+ fmul v19.2d, v3.2d, v8.2d[0]
+
+ fmul v20.2d, v0.2d, v8.2d[1]
+ fmul v21.2d, v1.2d, v8.2d[1]
+ fmul v22.2d, v2.2d, v8.2d[1]
+ fmul v23.2d, v3.2d, v8.2d[1]
+
+ fmul v24.2d, v0.2d, v9.2d[0]
+ fmul v25.2d, v1.2d, v9.2d[0]
+ fmul v26.2d, v2.2d, v9.2d[0]
+ fmul v27.2d, v3.2d, v9.2d[0]
+
+ fmul v28.2d, v0.2d, v9.2d[1]
+ fmul v29.2d, v1.2d, v9.2d[1]
+ fmul v30.2d, v2.2d, v9.2d[1]
+ fmul v31.2d, v3.2d, v9.2d[1]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v31.2d, v3.2d, v9.2d[1]
+
+ ld1 {v4.2d, v5.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v12.2d, v13.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v6.2d, v7.2d}, [pA]
+ add pA, pA, #32
+
+ prfm PLDL1KEEP, [pA, #512]
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v22.2d, v6.2d, v12.2d[1]
+ fmla v23.2d, v7.2d, v12.2d[1]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v13.2d[0]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v31.2d, v7.2d, v13.2d[1]
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ prfm PLDL1KEEP, [pB, #512]
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.2d, v4.2d, v12.2d[0]
+ fmla v17.2d, v5.2d, v12.2d[0]
+ fmla v18.2d, v6.2d, v12.2d[0]
+ fmla v19.2d, v7.2d, v12.2d[0]
+
+ fmla v20.2d, v4.2d, v12.2d[1]
+ fmla v21.2d, v5.2d, v12.2d[1]
+ fmla v22.2d, v6.2d, v12.2d[1]
+ fmla v23.2d, v7.2d, v12.2d[1]
+
+ fmla v24.2d, v4.2d, v13.2d[0]
+ fmla v25.2d, v5.2d, v13.2d[0]
+ fmla v26.2d, v6.2d, v13.2d[0]
+ fmla v27.2d, v7.2d, v13.2d[0]
+
+ fmla v28.2d, v4.2d, v13.2d[1]
+ fmla v29.2d, v5.2d, v13.2d[1]
+ fmla v30.2d, v6.2d, v13.2d[1]
+ fmla v31.2d, v7.2d, v13.2d[1]
+.endm
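+
+// KERNEL8x4_I/_M1/_M2/_E form a two-stage software pipeline: _I starts the
+// tile with fmul (so INIT8x4 is not needed on this path) and preloads the
+// next A/B fragments into v4-v7/v12-v13, _M1 and _M2 then alternate between
+// the two register sets -- each consumes one set while reloading the other --
+// and _E drains the last preloaded set without issuing further loads.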
+
+.macro KERNEL8x4_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v25.2d, v1.2d, v9.2d[0]
+ fmla v26.2d, v2.2d, v9.2d[0]
+ fmla v27.2d, v3.2d, v9.2d[0]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v29.2d, v1.2d, v9.2d[1]
+ fmla v30.2d, v2.2d, v9.2d[1]
+ fmla v31.2d, v3.2d, v9.2d[1]
+.endm
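+
+// KERNEL8x4_SUB is the plain, non-pipelined single-k step; the loop code
+// below uses it to finish off an odd k count after the _I/_M1/_M2/_E chain.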
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.2d, v20.2d, alphaV0
+ fmul v5.2d, v21.2d, alphaV1
+ fmul v6.2d, v22.2d, alphaV2
+ fmul v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.2d, v24.2d, alphaV0
+ fmul v1.2d, v25.2d, alphaV1
+ fmul v2.2d, v26.2d, alphaV2
+ fmul v3.2d, v27.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
+
+ fmul v4.2d, v28.2d, alphaV0
+ fmul v5.2d, v29.2d, alphaV1
+ fmul v6.2d, v30.2d, alphaV2
+ fmul v7.2d, v31.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
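+
+// The SAVE* macros scale the accumulators by alpha and overwrite C; there is
+// no read-modify-write of C, as expected for TRMM. alphaV0..alphaV3 all hold
+// the same value (see the fmov sequence in the prologue), presumably spread
+// over four registers to give the scheduler more freedom in the fmul chain.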
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+ fmov d24, d17
+ fmov d25, d16
+ fmov d28, d17
+ fmov d29, d16
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v29.2d, v1.2d, v9.2d[1]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v25.2d, v1.2d, v9.2d[0]
+
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v21.2d, v1.2d, v8.2d[1]
+
+ fmla v28.2d, v0.2d, v9.2d[1]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
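+
+// The fmla order in KERNEL4x4_SUB is intentionally interleaved: consecutive
+// instructions alternate between the v0- and v1-based products and never
+// touch the accumulator just written, presumably to keep independent work in
+// flight on the Cortex-A57 FP pipes. The result is the same 4x4 rank-1
+// update a straight ordering would give.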
+
+.macro SAVE4x4
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV0
+ fmul v9.2d, v25.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV2
+ fmul v13.2d, v29.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT2x4
+ fmov d16, xzr
+ fmov d20, d16
+ fmov d24, d20
+ fmov d28, d16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v24.2d, v0.2d, v9.2d[0]
+ fmla v28.2d, v0.2d, v9.2d[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2d, v24.2d, alphaV2
+ st1 {v8.2d}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v28.2d, alphaV3
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr d0, [pA]
+ add pA, pA, #8
+
+ ld1 {v8.2d, v9.2d}, [pB]
+ add pB, pB, #32
+
+ fmla v16.2d, v8.2d, v0.d[0]
+ fmla v20.2d, v9.2d, v0.d[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.d}[0], [pCRow2]
+ st1 {v12.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
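+
+// In the 1xN cases the roles flip: the single A element is broadcast across
+// a vector of B values, so each accumulator lane belongs to a different
+// column of C. SAVE1x4 therefore stores the two lanes of v8 and v12 through
+// four separate column pointers instead of one contiguous store.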
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+ fmov d20, xzr
+ fmov d21, d16
+ fmov d22, d17
+ fmov d23, d18
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+ fmla v22.2d, v2.2d, v8.2d[1]
+ fmla v23.2d, v3.2d, v8.2d[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ fmul v4.2d, v20.2d, alphaV0
+ fmul v5.2d, v21.2d, alphaV1
+ fmul v6.2d, v22.2d, alphaV2
+ fmul v7.2d, v23.2d, alphaV3
+ st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov d16, xzr
+ fmov d17, d16
+ fmov d20, d17
+ fmov d21, d16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+ fmla v21.2d, v1.2d, v8.2d[1]
+.endm
+
+.macro SAVE4x2
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV2
+ fmul v13.2d, v21.2d, alphaV3
+ st1 {v12.2d, v13.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov d16, xzr
+ fmov d20, d16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2d}, [pB]
+ add pB, pB, #16
+
+ ld1 {v0.2d}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v20.2d, v0.2d, v8.2d[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2d, v20.2d, alphaV1
+ st1 {v12.2d}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2d} , [pB]
+ add pB , pB, #16
+
+ ldr d0 , [pA]
+ add pA, pA, #8
+
+ fmla v16.2d, v8.2d, v0.2d[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.d}[0], [pCRow0]
+ st1 {v8.d}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov d16, xzr
+ fmov d17, xzr
+ fmov d18, d16
+ fmov d19, d17
+.endm
+
+.macro KERNEL8x1_SUB
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v2.2d, v3.2d}, [pA]
+ add pA, pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+ fmla v18.2d, v2.2d, v8.2d[0]
+ fmla v19.2d, v3.2d, v8.2d[0]
+.endm
+
+.macro SAVE8x1
+ fmul v0.2d, v16.2d, alphaV0
+ fmul v1.2d, v17.2d, alphaV1
+ fmul v2.2d, v18.2d, alphaV2
+ fmul v3.2d, v19.2d, alphaV3
+ st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov d16, xzr
+ fmov d17, d16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d, v1.2d}, [pA]
+ add pA , pA, #32
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+ fmla v17.2d, v1.2d, v8.2d[0]
+.endm
+
+.macro SAVE4x1
+ fmul v8.2d, v16.2d, alphaV0
+ fmul v9.2d, v17.2d, alphaV1
+ st1 {v8.2d, v9.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ld1 {v0.2d}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2d, v0.2d, v8.2d[0]
+.endm
+
+.macro SAVE2x1
+ fmul v8.2d, v16.2d, alphaV0
+ st1 {v8.2d}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov d16, xzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr d8, [pB]
+ add pB , pB, #8
+
+ ldr d0, [pA]
+ add pA , pA, #8
+
+ fmadd d16, d0, d8, d16
+.endm
+
+.macro SAVE1x1
+ fmul d8, d16, alpha0
+ str d8, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
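+
+/* The body below mirrors the previous dtrmm kernel with the blocking
+   transposed: N is walked 4 columns at a time (dtrmm_kernel_L4_BEGIN), M is
+   walked 8 rows at a time with 4/2/1-row tails, and the leftover 2 and 1
+   columns of N fall through to dtrmm_kernel_L2/L1_BEGIN. Only the 8x4 path
+   uses the pipelined _I/_M1/_M2/_E macros; the tail cases run the simple
+   *_SUB step in loops unrolled by eight. */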
+
+ PROLOGUE
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, d0
+ fmov alpha1, d0
+ fmov alpha2, d0
+ fmov alpha3, d0
+
+ lsl LDC, LDC, #3 // ldc = ldc * 8
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble dtrmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+dtrmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+dtrmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L4_M4_BEGIN
+
+dtrmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt dtrmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2 // subtract 2
+ ble dtrmm_kernel_L4_M8_22a
+ .align 5
+
+dtrmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M8_22
+
+
+dtrmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b dtrmm_kernel_L4_M8_44
+
+dtrmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble dtrmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+
+ KERNEL8x4_E
+
+ b dtrmm_kernel_L4_M8_44
+
+dtrmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+dtrmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble dtrmm_kernel_L4_M8_100
+
+dtrmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+dtrmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne dtrmm_kernel_L4_M8_20
+
+dtrmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L4_END
+
+ tst counterI, #4
+ ble dtrmm_kernel_L4_M2_BEGIN
+
+dtrmm_kernel_L4_M4_20:
+
+ INIT4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M4_40
+
+dtrmm_kernel_L4_M4_22:
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_22
+
+
+dtrmm_kernel_L4_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M4_100
+
+dtrmm_kernel_L4_M4_42:
+
+ KERNEL4x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M4_42
+
+dtrmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L4_M4_END:
+
+
+dtrmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L4_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L4_M1_BEGIN
+
+dtrmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M2_40
+
+dtrmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_22
+
+
+dtrmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M2_100
+
+dtrmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M2_42
+
+dtrmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L4_M2_END:
+
+
+dtrmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L4_END
+
+dtrmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L4_M1_40
+
+dtrmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_22
+
+
+dtrmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L4_M1_100
+
+dtrmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L4_M1_42
+
+dtrmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L4_END:
+
+ lsl temp, origK, #5
+ add origPB, origPB, temp // B = B + K * 4 * 8
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt dtrmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+dtrmm_kernel_L2_BEGIN:				// less than 4 columns of N left
+
+ mov counterJ , origN
+ tst counterJ , #3
+	ble	dtrmm_kernel_L999		// N mod 4 == 0: nothing left to do
+
+ tst counterJ , #2
+ ble dtrmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L2_M4_BEGIN
+
+dtrmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M8_40
+ .align 5
+
+dtrmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M8_22
+
+
+dtrmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M8_100
+
+dtrmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M8_42
+
+dtrmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L2_M8_20
+
+dtrmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #4			// process 4 rows if M & 4
+ ble dtrmm_kernel_L2_M2_BEGIN
+
+dtrmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M4_40
+ .align 5
+
+dtrmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_22
+
+
+dtrmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M4_100
+
+dtrmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M4_42
+
+dtrmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L2_M4_END:
+
+
+dtrmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L2_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L2_M1_BEGIN
+
+dtrmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble dtrmm_kernel_L2_M2_40
+
+dtrmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_22
+
+
+dtrmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M2_100
+
+dtrmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M2_42
+
+dtrmm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L2_M2_END:
+
+
+dtrmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L2_END
+
+dtrmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble dtrmm_kernel_L2_M1_40
+
+dtrmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_22
+
+
+dtrmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L2_M1_100
+
+dtrmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L2_M1_42
+
+dtrmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+dtrmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
+
+/******************************************************************************/
+
+dtrmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble dtrmm_kernel_L999 // done
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+dtrmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble dtrmm_kernel_L1_M4_BEGIN
+
+dtrmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M8_40
+ .align 5
+
+dtrmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M8_22
+
+
+dtrmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M8_100
+
+dtrmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M8_42
+
+dtrmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+dtrmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt dtrmm_kernel_L1_M8_20
+
+dtrmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #4			// process 4 rows if M & 4
+ ble dtrmm_kernel_L1_M2_BEGIN
+
+dtrmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M4_40
+ .align 5
+
+dtrmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_22
+
+
+dtrmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M4_100
+
+dtrmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M4_42
+
+dtrmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+dtrmm_kernel_L1_M4_END:
+
+dtrmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble dtrmm_kernel_L1_END
+
+	tst	counterI, #2			// process 2 rows if M & 2
+ ble dtrmm_kernel_L1_M1_BEGIN
+
+dtrmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M2_40
+
+dtrmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_22
+
+
+dtrmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M2_100
+
+dtrmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M2_42
+
+dtrmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+dtrmm_kernel_L1_M2_END:
+
+
+dtrmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble dtrmm_kernel_L1_END
+
+dtrmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble dtrmm_kernel_L1_M1_40
+
+dtrmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_22
+
+
+dtrmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble dtrmm_kernel_L1_M1_100
+
+dtrmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt dtrmm_kernel_L1_M1_42
+
+dtrmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+
+dtrmm_kernel_L1_END:
+
+
+dtrmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S
new file mode 100644
index 000000000..22b55b01c
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_16x4.S
@@ -0,0 +1,1987 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
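+// For reference only: a rough C sketch (an assumption inferred from the code
+// below, not taken from this file) of what one 16x4 micro-tile update does;
+// `acc` is an illustrative name, `ba`, `bb` and `ldc` come from the prototype
+// above, and the packed layouts of ba/bb are assumed:
+//
+//   for (k = 0; k < bk; k++)
+//     for (j = 0; j < 4; j++)
+//       for (i = 0; i < 16; i++)
+//         acc[j][i] += ba[16 * k + i] * bb[4 * k + j];
+//   // the SAVE macros then apply C[j * ldc + i] += alpha * acc[j][i]
+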
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define temp x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 temp
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
+//v01 pA0_04, pA0_05, pA0_06, pA0_07
+//v02 pA0_08, pA0_09, pA0_10, pA0_11
+//v03 pA0_12, pA0_13, pA0_14, pA0_15
+//v04 pA1_00, pA1_01, pA1_02, pA1_03
+//v05 pA1_04, pA1_05, pA1_06, pA1_07
+//v06 pA1_08, pA1_09, pA1_10, pA1_11
+//v07 pA1_12, pA1_13, pA1_14, pA1_15
+//v08 must save pB00, pB01
+//v09 must save pB02, pB03
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB10, pB11
+//v13 must save pB12, pB13
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
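+// Note (inferred from the code below, stated here as an assumption): each tile
+// size provides an INIT macro that zeroes its accumulators and a KERNEL*_SUB
+// macro for a single load+fma step. The larger tiles additionally use a
+// software-pipelined sequence _I / _M1 / _M2 / _E: _I performs the first K step
+// with plain fmul (so it also initializes the accumulators) and preloads the
+// next operands, _M1 and _M2 alternate between the two register sets, and _E
+// drains the last preloaded set without issuing further loads.
+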
+.macro INIT16x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL16x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v18.4s, v2.4s, v8.2s[0]
+ fmul v19.4s, v3.4s, v8.2s[0]
+
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v22.4s, v2.4s, v8.2s[1]
+ fmul v23.4s, v3.4s, v8.2s[1]
+
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v26.4s, v2.4s, v9.2s[0]
+ fmul v27.4s, v3.4s, v9.2s[0]
+
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v30.4s, v2.4s, v9.2s[1]
+ fmul v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+.endm
+
+.macro KERNEL16x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+.endm
+
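+// The SAVE macros below appear to implement the final C update: each C column
+// pointer (pCRow0, advanced by LDC) is loaded, combined as C += alpha * acc via
+// fmla against alphaV0..alphaV3, and stored back.
+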
+.macro SAVE16x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ fmla v2.4s, v26.4s, alphaV2
+ fmla v3.4s, v27.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ fmla v6.4s, v30.4s, alphaV2
+ fmla v7.4s, v31.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s, v9.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV0
+ fmla v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV2
+ fmla v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.s}[0], [pCRow2]
+ ld1 {v12.s}[1], [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, wzr
+ fmov s23, s16
+.endm
+
+.macro KERNEL16x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE16x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL16x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+.endm
+
+.macro SAVE16x1
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ ldr s8, [pCRow0]
+ fmla s8, s16, alphaV0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+sgemm_kernel_begin:
+
+ .align 5
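+	// reserve 11 * 16 = 176 bytes and spill the registers noted in the
+	// register-usage comments above (d8-d17 and x18-x28)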
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
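+	// replicate alpha (passed in s0) into the four alpha registers used by
+	// the SAVE macros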
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble sgemm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+sgemm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+ mov pA, origPA // pA = start of A array
+
+sgemm_kernel_L4_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble sgemm_kernel_L4_M8_BEGIN
+
+sgemm_kernel_L4_M16_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M16_32
+
+ KERNEL16x4_I // do one in the K
+ KERNEL16x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M16_22a
+ .align 5
+
+sgemm_kernel_L4_M16_22:
+
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M16_22
+
+sgemm_kernel_L4_M16_22a:
+
+ KERNEL16x4_M1
+ KERNEL16x4_E
+
+ b sgemm_kernel_L4_M16_44
+
+sgemm_kernel_L4_M16_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M16_40
+
+ KERNEL16x4_I
+ KERNEL16x4_E
+
+ b sgemm_kernel_L4_M16_44
+
+sgemm_kernel_L4_M16_40:
+
+ INIT16x4
+
+sgemm_kernel_L4_M16_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M16_100
+
+sgemm_kernel_L4_M16_46:
+
+ KERNEL16x4_SUB
+
+sgemm_kernel_L4_M16_100:
+
+ SAVE16x4
+
+sgemm_kernel_L4_M16_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L4_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L4_M4_BEGIN
+
+sgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M8_22a
+ .align 5
+
+sgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M8_22
+
+sgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+sgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M8_100
+
+sgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+sgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+sgemm_kernel_L4_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L4_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+sgemm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+ subs counterJ, counterJ , #1 // j--
+ bgt sgemm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+sgemm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble sgemm_kernel_L999
+
+ tst counterJ , #2
+ ble sgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+sgemm_kernel_L2_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI,#0
+ ble sgemm_kernel_L2_M8_BEGIN
+
+sgemm_kernel_L2_M16_20:
+
+ INIT16x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M16_40
+ .align 5
+
+sgemm_kernel_L2_M16_22:
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M16_22
+
+
+sgemm_kernel_L2_M16_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M16_100
+
+sgemm_kernel_L2_M16_42:
+
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M16_42
+
+sgemm_kernel_L2_M16_100:
+
+ SAVE16x2
+
+sgemm_kernel_L2_M16_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L2_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L2_M8_BEGIN:
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L2_M4_BEGIN
+
+sgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M8_40
+ .align 5
+
+sgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_22
+
+
+sgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M8_100
+
+sgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_42
+
+sgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+sgemm_kernel_L2_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L2_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M4_40
+ .align 5
+
+sgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+//------------------------------------------------------------------------------
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L2_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+sgemm_kernel_L2_END:
+
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble sgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+ add pC , pC , LDC // Update pC to point to next
+
+ mov pA, origPA // pA = A
+
+sgemm_kernel_L1_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble sgemm_kernel_L1_M8_BEGIN
+
+sgemm_kernel_L1_M16_20:
+
+ INIT16x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M16_40
+ .align 5
+
+sgemm_kernel_L1_M16_22:
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M16_22
+
+
+sgemm_kernel_L1_M16_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M16_100
+
+sgemm_kernel_L1_M16_42:
+
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M16_42
+
+sgemm_kernel_L1_M16_100:
+
+ SAVE16x1
+
+sgemm_kernel_L1_M16_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L1_M16_20
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #8
+ ble sgemm_kernel_L1_M4_BEGIN
+
+sgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M8_40
+ .align 5
+
+sgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_22
+
+
+sgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M8_100
+
+sgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_42
+
+sgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+sgemm_kernel_L1_M8_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M4_40
+ .align 5
+
+sgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+//------------------------------------------------------------------------------
+
+sgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L1_END
+
+	tst counterI, #2					// test for a remaining 2-row block (M & 2)
+ ble sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+sgemm_kernel_L1_END:
+
+sgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S
new file mode 100644
index 000000000..ac690e4d4
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_8x8.S
@@ -0,0 +1,2305 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
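+// For reference only: by analogy with sgemm_kernel_16x4.S, this kernel is
+// assumed to update C := C + alpha * A * B over 8x8 micro-tiles of packed
+// operands; roughly (illustrative `acc`, packed layouts assumed):
+//
+//   for (k = 0; k < bk; k++)
+//     for (j = 0; j < 8; j++)
+//       for (i = 0; i < 8; i++)
+//         acc[j][i] += ba[8 * k + i] * bb[8 * k + j];
+//
+// with the SAVE macros applying C[j * ldc + i] += alpha * acc[j][i].
+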
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
+//v01 pA0_4, pA0_5, pA0_6, pA0_7
+//v02 pA1_0, pA1_1, pA1_2, pA1_3
+//v03 pA1_4, pA1_5, pA1_6, pA1_7
+//v04 pB0_0, pB0_1, pB0_2, pB0_3
+//v05 pB0_4, pB0_5, pB0_6, pB0_7
+//v06 pB1_0, pB1_1, pB1_2, pB1_3
+//v07 pB1_4, pB1_5, pB1_6, pB1_7
+//v08 must save
+//v09 must save
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save
+//v13 must save
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
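+// Note: the INIT / KERNEL*_SUB / _I / _M1 / _M2 / _E macro pattern here is
+// assumed to follow the same two-stage software-pipeline scheme described in
+// sgemm_kernel_16x4.S.
+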
+.macro INIT8x8
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v17.4s, v1.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v19.4s, v1.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v21.4s, v1.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v23.4s, v1.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v25.4s, v1.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v27.4s, v1.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v29.4s, v1.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+.endm
+
+.macro KERNEL8x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+.endm
+
+.macro SAVE8x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV2
+ fmla v3.4s, v19.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow2]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.4s, v7.4s}, [pCRow1]
+ fmla v6.4s, v22.4s, alphaV2
+ fmla v7.4s, v23.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s, v3.4s}, [pCRow1]
+ fmla v2.4s, v26.4s, alphaV2
+ fmla v3.4s, v27.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow2]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ ld1 {v6.4s, v7.4s}, [pCRow1]
+ fmla v6.4s, v30.4s, alphaV2
+ fmla v7.4s, v31.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL4x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+.endm
+
+.macro KERNEL4x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+.endm
+
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ st1 {v0.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s}, [pCRow1]
+ fmla v2.4s, v18.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s}, [pCRow2]
+ fmla v4.4s, v20.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.4s}, [pCRow1]
+ fmla v6.4s, v22.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ st1 {v0.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.4s}, [pCRow1]
+ fmla v2.4s, v26.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.4s}, [pCRow2]
+ fmla v4.4s, v28.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ ld1 {v6.4s}, [pCRow1]
+ fmla v6.4s, v30.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v4.4s[0]
+ fmla v18.2s, v0.2s, v4.4s[1]
+ fmla v20.2s, v0.2s, v4.4s[2]
+ fmla v22.2s, v0.2s, v4.4s[3]
+ fmla v24.2s, v0.2s, v5.4s[0]
+ fmla v26.2s, v0.2s, v5.4s[1]
+ fmla v28.2s, v0.2s, v5.4s[2]
+ fmla v30.2s, v0.2s, v5.4s[3]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.2s}, [pCRow0]
+ fmla v0.2s, v16.2s, alphaV0
+ st1 {v0.2s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.2s}, [pCRow1]
+ fmla v2.2s, v18.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.2s}, [pCRow2]
+ fmla v4.2s, v20.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v6.2s}, [pCRow1]
+ fmla v6.2s, v22.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.2s}, [pCRow2]
+ fmla v0.2s, v24.2s, alphaV0
+ st1 {v0.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v2.2s}, [pCRow1]
+ fmla v2.2s, v26.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v4.2s}, [pCRow2]
+ fmla v4.2s, v28.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ ld1 {v6.2s}, [pCRow1]
+ fmla v6.2s, v30.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ fmla s16, s0, v4.4s[0]
+ fmla s18, s0, v4.4s[1]
+ fmla s20, s0, v4.4s[2]
+ fmla s22, s0, v4.4s[3]
+ fmla s24, s0, v5.4s[0]
+ fmla s26, s0, v5.4s[1]
+ fmla s28, s0, v5.4s[2]
+ fmla s30, s0, v5.4s[3]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+ ldr s0, [pCRow0]
+ fmla s0, s16, alphaV0
+ str s0, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s2, [pCRow1]
+ fmla s2, s18, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s4, [pCRow2]
+ fmla s4, s20, alphaV0
+ str s4, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s6, [pCRow1]
+ fmla s6, s22, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s0, [pCRow2]
+ fmla s0, s24, alphaV0
+ str s0, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ ldr s2, [pCRow1]
+ fmla s2, s26, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ldr s4, [pCRow2]
+ fmla s4, s28, alphaV0
+ str s4, [pCRow2]
+
+ ldr s6, [pCRow1]
+ fmla s6, s30, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow2]
+ fmla v0.4s, v24.4s, alphaV0
+ fmla v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v28.4s, alphaV0
+ fmla v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
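+// In the 4x4 kernels the accumulators are 2s pairs; the _M1/_M2 bodies
+// interleave the next-round loads and a prefetch between the fmla groups
+// to hide load latency.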
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s, v9.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV0
+ fmla v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV2
+ fmla v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ ld1 {v8.2s}, [pCRow2]
+ fmla v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+ ld1 {v12.s}[0], [pCRow2]
+ ld1 {v12.s}[1], [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
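+// Mx2 tiles (8x2 down to 1x2): two remaining columns of C. Only the plain
+// _SUB form is needed here, without software pipelining.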
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ ld1 {v4.4s, v5.4s}, [pCRow1]
+ fmla v4.4s, v20.4s, alphaV0
+ fmla v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ ld1 {v12.2s, v13.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV2
+ fmla v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+ ld1 {v12.2s}, [pCRow1]
+ fmla v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+ ld1 {v8.s}[0], [pCRow0]
+ ld1 {v8.s}[1], [pCRow1]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
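+// Mx1 tiles: one remaining column of C. A single B value per K step is
+// broadcast against 8/4/2/1 values of A.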
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+ ld1 {v0.4s, v1.4s}, [pCRow0]
+ fmla v0.4s, v16.4s, alphaV0
+ fmla v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+ ld1 {v8.2s, v9.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ fmla v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+ ld1 {v8.2s}, [pCRow0]
+ fmla v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ ldr s8, [pCRow0]
+ fmla s8, s16, alphaV0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
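+// Driver: the outer loop walks N in blocks of 8, 4, 2, 1 (labels L8/L4/L2/L1);
+// for each N block the inner loop walks M in blocks of 8, 4, 2, 1 (_M8.._M1),
+// and K is consumed by the kernel macros above.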
+
+ PROLOGUE
+
+sgemm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
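+	// keep four copies of alpha (passed in s0) for the alphaV0..alphaV3
+	// operands used by the SAVE macros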
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble sgemm_kernel_L4_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L8_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+ mov pA, origPA // pA = start of A array
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble sgemm_kernel_L8_M4_BEGIN
+
+sgemm_kernel_L8_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L8_M8_32
+
+ KERNEL8x8_I // do one in the K
+ KERNEL8x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L8_M8_22a
+ .align 5
+
+sgemm_kernel_L8_M8_22:
+
+ KERNEL8x8_M1
+ KERNEL8x8_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M8_22
+
+sgemm_kernel_L8_M8_22a:
+
+ KERNEL8x8_M1
+ KERNEL8x8_E
+
+ b sgemm_kernel_L8_M8_44
+
+sgemm_kernel_L8_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L8_M8_40
+
+ KERNEL8x8_I
+ KERNEL8x8_E
+
+ b sgemm_kernel_L8_M8_44
+
+sgemm_kernel_L8_M8_40:
+
+ INIT8x8
+
+sgemm_kernel_L8_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L8_M8_100
+
+sgemm_kernel_L8_M8_46:
+
+ KERNEL8x8_SUB
+
+sgemm_kernel_L8_M8_100:
+
+ SAVE8x8
+
+sgemm_kernel_L8_M8_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L8_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L8_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L8_M2_BEGIN
+
+sgemm_kernel_L8_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L8_M4_22a
+ .align 5
+
+sgemm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M4_22
+
+sgemm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b sgemm_kernel_L8_M4_44
+
+sgemm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+ KERNEL4x8_E
+
+ b sgemm_kernel_L8_M4_44
+
+sgemm_kernel_L8_M4_40:
+
+ INIT4x8
+
+sgemm_kernel_L8_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L8_M4_100
+
+sgemm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+sgemm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+sgemm_kernel_L8_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L8_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L8_M1_BEGIN
+
+sgemm_kernel_L8_M2_20:
+
+ INIT2x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L8_M2_40
+
+sgemm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M2_22
+
+
+sgemm_kernel_L8_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L8_M2_100
+
+sgemm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M2_42
+
+sgemm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+sgemm_kernel_L8_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L8_END
+
+sgemm_kernel_L8_M1_20:
+
+ INIT1x8
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L8_M1_40
+
+sgemm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M1_22
+
+
+sgemm_kernel_L8_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L8_M1_100
+
+sgemm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L8_M1_42
+
+sgemm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+sgemm_kernel_L8_END:
+ lsl temp, origK, #5 // B = B + K * 4 * 8
+ add origPB, origPB, temp
+
+ subs counterJ, counterJ , #1 // j--
+ bgt sgemm_kernel_L8_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble sgemm_kernel_L999
+
+ tst counterJ , #4
+ ble sgemm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #2
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble sgemm_kernel_L4_M4_BEGIN
+
+sgemm_kernel_L4_M8_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M8_22a
+ .align 5
+
+sgemm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M8_22
+
+sgemm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b sgemm_kernel_L4_M8_44
+
+sgemm_kernel_L4_M8_40:
+
+ INIT8x4
+
+sgemm_kernel_L4_M8_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M8_100
+
+sgemm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+sgemm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+sgemm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne sgemm_kernel_L4_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L4_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+ mov pB, origPB
+
+ asr counterL , origK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt sgemm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble sgemm_kernel_L4_M4_22a
+ .align 5
+
+sgemm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble sgemm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_40:
+
+ INIT4x4
+
+sgemm_kernel_L4_M4_44:
+
+ ands counterL , origK, #1
+ ble sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+sgemm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L4_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+ INIT2x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+ INIT1x4
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+sgemm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // less than 4 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble sgemm_kernel_L999
+
+ tst counterJ , #2
+ ble sgemm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI,#0
+ ble sgemm_kernel_L2_M4_BEGIN
+
+sgemm_kernel_L2_M8_20:
+
+ INIT8x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M8_40
+ .align 5
+
+sgemm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_22
+
+
+sgemm_kernel_L2_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M8_100
+
+sgemm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M8_42
+
+sgemm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+sgemm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L2_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L2_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+ INIT4x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M4_40
+ .align 5
+
+sgemm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L2_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+ INIT2x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+ SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+ INIT1x2
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+sgemm_kernel_L2_END:
+
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble sgemm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC					// advance pC to the next column of C
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3
+ cmp counterI, #0
+ ble sgemm_kernel_L1_M4_BEGIN
+
+sgemm_kernel_L1_M8_20:
+
+ INIT8x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M8_40
+ .align 5
+
+sgemm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_22
+
+
+sgemm_kernel_L1_M8_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M8_100
+
+sgemm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M8_42
+
+sgemm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+sgemm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt sgemm_kernel_L1_M8_20
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble sgemm_kernel_L1_END
+
+ tst counterI, #4
+ ble sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+ INIT4x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M4_40
+ .align 5
+
+sgemm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble sgemm_kernel_L1_END
+
+	tst	counterI, #2					// is there a 2-row block left in M?
+ ble sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+ INIT2x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+ INIT1x1
+
+ mov pB, origPB
+
+ asr counterL , origK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+ ands counterL , origK, #7 // counterL = counterL % 8
+ ble sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+sgemm_kernel_L1_END:
+
+/******************************************************************************/
+
+sgemm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S
new file mode 100755
index 000000000..b99760a03
--- /dev/null
+++ b/kernel/arm64/strmm_kernel_16x4.S
@@ -0,0 +1,2431 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
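+
+// TRMM kernel: unlike the GEMM kernels, the SAVE macros below overwrite C
+// with alpha * A * B instead of accumulating into it; the extra offset
+// argument drives the tempOffset/tempK bookkeeping in the loops.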
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03
+//v01 pA0_04, pA0_05, pA0_06, pA0_07
+//v02 pA0_08, pA0_09, pA0_10, pA0_11
+//v03 pA0_12, pA0_13, pA0_14, pA0_15
+//v04 pA1_00, pA1_01, pA1_02, pA1_03
+//v05 pA1_04, pA1_05, pA1_06, pA1_07
+//v06 pA1_08, pA1_09, pA1_10, pA1_11
+//v07 pA1_12, pA1_13, pA1_14, pA1_15
+//v08 must save pB00, pB01
+//v09 must save pB02, pB03
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save pB10, pB11
+//v13 must save pB12, pB13
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
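+// 16x4 is the main tile of this kernel: sixteen rows of C held in v16..v31
+// and four columns addressed through LDC. The _I/_M1/_M2/_E pipelining
+// follows the same scheme as the sgemm kernels.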
+
+.macro INIT16x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL16x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v18.4s, v2.4s, v8.2s[0]
+ fmul v19.4s, v3.4s, v8.2s[0]
+
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v22.4s, v2.4s, v8.2s[1]
+ fmul v23.4s, v3.4s, v8.2s[1]
+
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v26.4s, v2.4s, v9.2s[0]
+ fmul v27.4s, v3.4s, v9.2s[0]
+
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+ fmul v30.4s, v2.4s, v9.2s[1]
+ fmul v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v6.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v7.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL16x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v18.4s, v6.4s, v12.2s[0]
+ fmla v19.4s, v7.4s, v12.2s[0]
+
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v22.4s, v6.4s, v12.2s[1]
+ fmla v23.4s, v7.4s, v12.2s[1]
+
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v26.4s, v6.4s, v13.2s[0]
+ fmla v27.4s, v7.4s, v13.2s[0]
+
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+ fmla v30.4s, v6.4s, v13.2s[1]
+ fmla v31.4s, v7.4s, v13.2s[1]
+.endm
+
+.macro KERNEL16x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v26.4s, v2.4s, v9.2s[0]
+ fmla v27.4s, v3.4s, v9.2s[0]
+
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+ fmla v30.4s, v2.4s, v9.2s[1]
+ fmla v31.4s, v3.4s, v9.2s[1]
+.endm
+
+.macro SAVE16x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ fmul v2.4s, v26.4s, alphaV2
+ fmul v3.4s, v27.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2]
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ fmul v6.4s, v30.4s, alphaV2
+ fmul v7.4s, v31.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV0
+ fmul v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV2
+ fmul v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ fmul v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+ fmul v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x2
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, wzr
+ fmov s23, s16
+.endm
+
+.macro KERNEL16x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v22.4s, v2.4s, v8.2s[1]
+ fmla v23.4s, v3.4s, v8.2s[1]
+.endm
+
+.macro SAVE16x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT16x1
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, wzr
+ fmov s19, s16
+.endm
+
+.macro KERNEL16x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v18.4s, v2.4s, v8.2s[0]
+ fmla v19.4s, v3.4s, v8.2s[0]
+.endm
+
+.macro SAVE16x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #64
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+ fmul s8, s16, alpha0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
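+// Driver: N is walked in blocks of 4 (then its remainder) and M in blocks of
+// 16, 8, 4, 2, 1; each tile recomputes tempK from the TRMM offset before
+// running its K loop.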
+
+ PROLOGUE
+
+strmm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
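+
+	// tempOffset carries the TRMM offset for the current tile; the per-tile
+	// code below uses it to size tempK and to advance pA/pB past the part of
+	// the packed panels belonging to the zero half of the triangular operand.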
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #2 // J = J / 4
+ cmp counterJ, #0
+ ble strmm_kernel_L2_BEGIN
+
+/******************************************************************************/
+
+strmm_kernel_L4_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = start of A array
+
+strmm_kernel_L4_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble strmm_kernel_L4_M8_BEGIN
+
+strmm_kernel_L4_M16_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M16_32
+
+ KERNEL16x4_I // do one in the K
+ KERNEL16x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M16_22a
+ .align 5
+
+strmm_kernel_L4_M16_22:
+
+ KERNEL16x4_M1
+ KERNEL16x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M16_22
+
+strmm_kernel_L4_M16_22a:
+
+ KERNEL16x4_M1
+ KERNEL16x4_E
+
+ b strmm_kernel_L4_M16_44
+
+strmm_kernel_L4_M16_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M16_40
+
+ KERNEL16x4_I
+ KERNEL16x4_E
+
+ b strmm_kernel_L4_M16_44
+
+strmm_kernel_L4_M16_40:
+
+ INIT16x4
+
+strmm_kernel_L4_M16_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M16_100
+
+strmm_kernel_L4_M16_46:
+
+ KERNEL16x4_SUB
+
+strmm_kernel_L4_M16_100:
+
+ SAVE16x4
+
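+	// post-store fixup: step pA/pB past the K entries this tile did not
+	// consume and, for the LEFT case, grow tempOffset by the 16 rows just
+	// finished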
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L4_M16_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L4_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L4_END
+
+ tst counterI, #8
+ ble strmm_kernel_L4_M4_BEGIN
+
+strmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M8_22a
+ .align 5
+
+strmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M8_22
+
+strmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+strmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M8_100
+
+strmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+strmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L4_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L4_END
+
+ tst counterI, #4
+ ble strmm_kernel_L4_M2_BEGIN
+
+strmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M4_32
+
+ KERNEL4x4_I // do one in the K
+ KERNEL4x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M4_22a
+ .align 5
+
+strmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M4_22
+
+strmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+strmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M4_100
+
+strmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+strmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L4_M4_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L4_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L4_M1_BEGIN
+
+strmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M2_40
+
+strmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_22
+
+
+strmm_kernel_L4_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L4_M2_100
+
+strmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_42
+
+strmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L4_M2_END:
+
+
+strmm_kernel_L4_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L4_END
+
+strmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M1_40
+
+strmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_22
+
+
+strmm_kernel_L4_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L4_M1_100
+
+strmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_42
+
+strmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt strmm_kernel_L4_BEGIN
+
+
+/******************************************************************************/
+
+strmm_kernel_L2_BEGIN: // less than 2 left in N direction
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble strmm_kernel_L999
+
+ tst counterJ , #2
+ ble strmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+strmm_kernel_L2_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI,#0
+ ble strmm_kernel_L2_M8_BEGIN
+
+strmm_kernel_L2_M16_20:
+
+ INIT16x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M16_40
+ .align 5
+
+strmm_kernel_L2_M16_22:
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M16_22
+
+
+strmm_kernel_L2_M16_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M16_100
+
+strmm_kernel_L2_M16_42:
+
+ KERNEL16x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M16_42
+
+strmm_kernel_L2_M16_100:
+
+ SAVE16x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L2_M16_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L2_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L2_M8_BEGIN:
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L2_END
+
+ tst counterI, #8
+ ble strmm_kernel_L2_M4_BEGIN
+
+strmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M8_40
+ .align 5
+
+strmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_22
+
+
+strmm_kernel_L2_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M8_100
+
+strmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_42
+
+strmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L2_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L2_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L2_END
+
+ tst counterI, #4
+ ble strmm_kernel_L2_M2_BEGIN
+
+strmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M4_40
+ .align 5
+
+strmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_22
+
+
+strmm_kernel_L2_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M4_100
+
+strmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_42
+
+strmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L2_M4_END:
+
+//------------------------------------------------------------------------------
+
+
+strmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L2_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L2_M1_BEGIN
+
+strmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M2_40
+
+strmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_22
+
+
+strmm_kernel_L2_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M2_100
+
+strmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_42
+
+strmm_kernel_L2_M2_100:
+
+ SAVE2x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L2_M2_END:
+
+
+strmm_kernel_L2_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L2_END
+
+strmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL, #0
+ ble strmm_kernel_L2_M1_40
+
+strmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_22
+
+
+strmm_kernel_L2_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L2_M1_100
+
+strmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_42
+
+strmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+
+strmm_kernel_L1_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #1
+ ble strmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// update pC to point to the next column of C
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+strmm_kernel_L1_M16_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #4 // counterI = counterI / 16
+ cmp counterI, #0
+ ble strmm_kernel_L1_M8_BEGIN
+
+strmm_kernel_L1_M16_20:
+
+ INIT16x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #6
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #16
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M16_40
+ .align 5
+
+strmm_kernel_L1_M16_22:
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M16_22
+
+
+strmm_kernel_L1_M16_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M16_100
+
+strmm_kernel_L1_M16_42:
+
+ KERNEL16x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M16_42
+
+strmm_kernel_L1_M16_100:
+
+ SAVE16x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #16
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #6
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #16
+#endif
+
+strmm_kernel_L1_M16_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L1_M16_20
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #15
+ ble strmm_kernel_L1_END
+
+ tst counterI, #8
+ ble strmm_kernel_L1_M4_BEGIN
+
+strmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M8_40
+ .align 5
+
+strmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_22
+
+
+strmm_kernel_L1_M8_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M8_100
+
+strmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_42
+
+strmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L1_M8_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M4_BEGIN:
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L1_END
+
+ tst counterI, #4
+ ble strmm_kernel_L1_M2_BEGIN
+
+strmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M4_40
+ .align 5
+
+strmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_22
+
+
+strmm_kernel_L1_M4_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M4_100
+
+strmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_42
+
+strmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L1_M4_END:
+
+//------------------------------------------------------------------------------
+
+strmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L1_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L1_M1_BEGIN
+
+strmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M2_40
+
+strmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_22
+
+
+strmm_kernel_L1_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M2_100
+
+strmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_42
+
+strmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L1_M2_END:
+
+
+strmm_kernel_L1_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L1_END
+
+strmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M1_40
+
+strmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_22
+
+
+strmm_kernel_L1_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L1_M1_100
+
+strmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_42
+
+strmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+strmm_kernel_L1_END:
+
+strmm_kernel_L999:
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S
new file mode 100755
index 000000000..98b912934
--- /dev/null
+++ b/kernel/arm64/strmm_kernel_8x8.S
@@ -0,0 +1,2795 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */
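
The prototype above is the C-level entry point for this kernel (the exported symbol name is produced by the `CNAME` macro from `common.h`). As a reading aid only, here is a minimal, hypothetical C sketch of a call site, assuming the prototype in the comment; the real call site lives in the level-3 TRMM driver and is not part of this patch, and the names `strmm_kernel_8x8` and `example_call` are placeholders.

```c
/* Hypothetical sketch only -- not part of the patch. Types and the symbol
 * name are assumptions; the real symbol is generated by the CNAME macro. */
typedef long  BLASLONG;
typedef float FLOAT;

int strmm_kernel_8x8(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
                     FLOAT *ba, FLOAT *bb, FLOAT *C, BLASLONG ldc,
                     BLASLONG offset);

static void example_call(FLOAT *packedA, FLOAT *packedB, FLOAT *C, BLASLONG ldc)
{
    /* Placeholder tile sizes: an 8x8 block of C with K = 256, alpha = 1, offset = 0. */
    strmm_kernel_8x8(8, 8, 256, 1.0f, packedA, packedB, C, ldc, 0);
}
```
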
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define counterJ x10
+#define pB x11
+#define pCRow0 x12
+#define pCRow1 x13
+#define pCRow2 x14
+#define pA x15
+#define temp x16
+#define tempOffset x17
+#define tempK x18
+
+#define alpha0 s10
+#define alphaV0 v10.s[0]
+#define alpha1 s11
+#define alphaV1 v11.s[0]
+#define alpha2 s14
+#define alphaV2 v14.s[0]
+#define alpha3 s15
+#define alphaV3 v15.s[0]
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 counterJ
+// 11 pB
+// 12 pCRow0
+// 13 pCRow1
+// 14 pCRow2
+// 15 pA
+// 16 temp
+// 17 tempOffset
+// 18 must save tempK
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3
+//v01 pA0_4, pA0_5, pA0_6, pA0_7
+//v02 pA1_0, pA1_1, pA1_2, pA1_3
+//v03 pA1_4, pA1_5, pA1_6, pA1_7
+//v04 pB0_0, pB0_1, pB0_2, pB0_3
+//v05 pB0_4, pB0_5, pB0_6, pB0_7
+//v06 pB1_0, pB1_1, pB1_2, pB1_3
+//v07 pB1_4, pB1_5, pB1_6, pB1_7
+//v08 must save
+//v09 must save
+//v10 must save ALPHA0
+//v11 must save ALPHA1
+//v12 must save
+//v13 must save
+//v14 must save ALPHA2
+//v15 must save ALPHA3
+//v16 must save C00, C01, C02, C03
+//v17 must save C04, C05, C06, C07
+//v18 C08, C09, C10, C11
+//v19 C12, C13, C14, C15
+//v20 C16, C17, C18, C19
+//v21 C20, C21, C22, C23
+//v22 C24, C25, C26, C27
+//v23 C28, C29, C30, C31
+//v24 C32, C33, C34, C35
+//v25 C36, C37, C38, C39
+//v26 C40, C41, C42, C43
+//v27 C44, C45, C46, C47
+//v28 C48, C49, C50, C51
+//v29 C52, C53, C54, C55
+//v30 C56, C57, C58, C59
+//v31 C60, C61, C62, C63
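
The register map above keeps the entire 8x8 accumulator block of C in v16..v31 (two 4-lane vectors per column of the block), and each K step of the `KERNEL8x8_*` macros performs one rank-1 update of that block from eight packed A values and eight packed B values; `SAVE8x8` then scales by alpha and stores the block (these TRMM kernels store rather than accumulate into C). The following scalar reference model is a hedged sketch of what those macros compute, not OpenBLAS API.

```c
/* Hedged reference model of the 8x8 micro-kernel: one rank-1 update of an
 * 8x8 accumulator per K step, then an alpha-scaled store into C.
 * Function and parameter names are illustrative, not OpenBLAS API. */
void strmm_8x8_reference(long K, float alpha,
                         const float *A,   /* packed A: 8 floats per K step */
                         const float *B,   /* packed B: 8 floats per K step */
                         float *C, long ldc)
{
    float acc[8][8] = {{0.0f}};             /* acc[j][i] ~ lanes of v16..v31 */

    for (long k = 0; k < K; k++)
        for (long j = 0; j < 8; j++)        /* B element selects the column of C */
            for (long i = 0; i < 8; i++)    /* A element selects the row of C    */
                acc[j][i] += A[8 * k + i] * B[8 * k + j];

    for (long j = 0; j < 8; j++)            /* SAVE8x8: scale by alpha, store */
        for (long i = 0; i < 8; i++)
            C[j * ldc + i] = alpha * acc[j][i];
}
```
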
+
+/*******************************************************************************
+* Macro definitions
+*******************************************************************************/
+
+.macro INIT8x8
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s18, s16
+ fmov s19, s17
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s22, s17
+ fmov s23, s18
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s26, s17
+ fmov s27, s18
+ fmov s28, wzr
+ fmov s29, s16
+ fmov s30, s17
+ fmov s31, s18
+.endm
+
+.macro KERNEL8x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v17.4s, v1.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v19.4s, v1.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v21.4s, v1.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v23.4s, v1.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v25.4s, v1.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v27.4s, v1.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v29.4s, v1.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+ fmul v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v3.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v17.4s, v3.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v19.4s, v3.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v21.4s, v3.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v23.4s, v3.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v25.4s, v3.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v27.4s, v3.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v29.4s, v3.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+ fmla v31.4s, v3.4s, v7.4s[3]
+.endm
+
+.macro KERNEL8x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v17.4s, v1.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v19.4s, v1.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v21.4s, v1.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v23.4s, v1.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v25.4s, v1.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v27.4s, v1.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v29.4s, v1.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+ fmla v31.4s, v1.4s, v5.4s[3]
+.endm
+
+.macro SAVE8x8
+ add pCRow1, pCRow0, LDC
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v2.4s, v18.4s, alphaV2
+ fmul v3.4s, v19.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v6.4s, v22.4s, alphaV2
+ fmul v7.4s, v23.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v2.4s, v26.4s, alphaV2
+ fmul v3.4s, v27.4s, alphaV3
+ st1 {v2.4s, v3.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow2]
+
+ fmul v6.4s, v30.4s, alphaV2
+ fmul v7.4s, v31.4s, alphaV3
+ st1 {v6.4s, v7.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL4x8_I
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v4.4s[0]
+ fmul v18.4s, v0.4s, v4.4s[1]
+ fmul v20.4s, v0.4s, v4.4s[2]
+ fmul v22.4s, v0.4s, v4.4s[3]
+ fmul v24.4s, v0.4s, v5.4s[0]
+ fmul v26.4s, v0.4s, v5.4s[1]
+ fmul v28.4s, v0.4s, v5.4s[2]
+ fmul v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M1
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+
+ ld1 {v6.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v7.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v2.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_M2
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x8_E
+ fmla v16.4s, v2.4s, v6.4s[0]
+ fmla v18.4s, v2.4s, v6.4s[1]
+ fmla v20.4s, v2.4s, v6.4s[2]
+ fmla v22.4s, v2.4s, v6.4s[3]
+ fmla v24.4s, v2.4s, v7.4s[0]
+ fmla v26.4s, v2.4s, v7.4s[1]
+ fmla v28.4s, v2.4s, v7.4s[2]
+ fmla v30.4s, v2.4s, v7.4s[3]
+.endm
+
+.macro KERNEL4x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v4.4s[0]
+ fmla v18.4s, v0.4s, v4.4s[1]
+ fmla v20.4s, v0.4s, v4.4s[2]
+ fmla v22.4s, v0.4s, v4.4s[3]
+ fmla v24.4s, v0.4s, v5.4s[0]
+ fmla v26.4s, v0.4s, v5.4s[1]
+ fmla v28.4s, v0.4s, v5.4s[2]
+ fmla v30.4s, v0.4s, v5.4s[3]
+.endm
+
+.macro SAVE4x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ st1 {v0.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.4s, v18.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v6.4s, v22.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0
+ st1 {v0.4s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.4s, v26.4s, alphaV2
+ st1 {v2.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.4s, v28.4s, alphaV0
+ st1 {v4.4s}, [pCRow2]
+
+
+ fmul v6.4s, v30.4s, alphaV2
+ st1 {v6.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL2x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v4.4s[0]
+ fmla v18.2s, v0.2s, v4.4s[1]
+ fmla v20.2s, v0.2s, v4.4s[2]
+ fmla v22.2s, v0.2s, v4.4s[3]
+ fmla v24.2s, v0.2s, v5.4s[0]
+ fmla v26.2s, v0.2s, v5.4s[1]
+ fmla v28.2s, v0.2s, v5.4s[2]
+ fmla v30.2s, v0.2s, v5.4s[3]
+.endm
+
+.macro SAVE2x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.2s, v16.2s, alphaV0
+ st1 {v0.2s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.2s, v18.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.2s, v20.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v6.2s, v22.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.2s, v24.2s, alphaV0
+ st1 {v0.2s}, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v2.2s, v26.2s, alphaV2
+ st1 {v2.2s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v4.2s, v28.2s, alphaV0
+ st1 {v4.2s}, [pCRow2]
+
+
+ fmul v6.2s, v30.2s, alphaV2
+ st1 {v6.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x8
+ fmov s16, wzr
+ fmov s18, wzr
+ fmov s20, wzr
+ fmov s22, s16
+ fmov s24, wzr
+ fmov s26, s16
+ fmov s28, s18
+ fmov s30, s20
+.endm
+
+.macro KERNEL1x8_SUB
+ ld1 {v4.4s}, [pB]
+ add pB, pB, #16
+ ld1 {v5.4s}, [pB]
+ add pB, pB, #16
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ fmla s16, s0, v4.4s[0]
+ fmla s18, s0, v4.4s[1]
+ fmla s20, s0, v4.4s[2]
+ fmla s22, s0, v4.4s[3]
+ fmla s24, s0, v5.4s[0]
+ fmla s26, s0, v5.4s[1]
+ fmla s28, s0, v5.4s[2]
+ fmla s30, s0, v5.4s[3]
+.endm
+
+.macro SAVE1x8
+ add pCRow1, pCRow0, LDC
+
+
+ fmul s0, s16, alphaV0
+ str s0, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s2, s18, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s4, s20, alphaV0
+ str s4, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s6, s22, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s0, s24, alphaV0
+ str s0, [pCRow2]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul s2, s26, alphaV2
+ str s2, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul s4, s28, alphaV0
+ str s4, [pCRow2]
+
+
+ fmul s6, s30, alphaV2
+ str s6, [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x4
+ fmov s16, wzr
+ fmov s17, wzr
+ fmov s20, wzr
+ fmov s21, s16
+ fmov s24, wzr
+ fmov s25, s16
+ fmov s28, wzr
+ fmov s29, s16
+.endm
+
+.macro KERNEL8x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.4s, v0.4s, v8.2s[0]
+ fmul v17.4s, v1.4s, v8.2s[0]
+ fmul v20.4s, v0.4s, v8.2s[1]
+ fmul v21.4s, v1.4s, v8.2s[1]
+ fmul v24.4s, v0.4s, v9.2s[0]
+ fmul v25.4s, v1.4s, v9.2s[0]
+ fmul v28.4s, v0.4s, v9.2s[1]
+ fmul v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M1
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v5.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_M2
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL8x4_E
+ fmla v16.4s, v4.4s, v12.2s[0]
+ fmla v17.4s, v5.4s, v12.2s[0]
+ fmla v20.4s, v4.4s, v12.2s[1]
+ fmla v21.4s, v5.4s, v12.2s[1]
+ fmla v24.4s, v4.4s, v13.2s[0]
+ fmla v25.4s, v5.4s, v13.2s[0]
+ fmla v28.4s, v4.4s, v13.2s[1]
+ fmla v29.4s, v5.4s, v13.2s[1]
+.endm
+
+.macro KERNEL8x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+ fmla v24.4s, v0.4s, v9.2s[0]
+ fmla v25.4s, v1.4s, v9.2s[0]
+ fmla v28.4s, v0.4s, v9.2s[1]
+ fmla v29.4s, v1.4s, v9.2s[1]
+.endm
+
+.macro SAVE8x4
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v0.4s, v24.4s, alphaV0
+ fmul v1.4s, v25.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow2]
+
+
+ fmul v4.4s, v28.4s, alphaV0
+ fmul v5.4s, v29.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+
+.macro INIT4x4
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+ fmov s24, s17
+ fmov s25, s16
+ fmov s28, s17
+ fmov s29, s16
+.endm
+
+.macro KERNEL4x4_I
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmul v16.2s, v0.2s, v8.2s[0]
+ fmul v29.2s, v1.2s, v9.2s[1]
+
+ fmul v20.2s, v0.2s, v8.2s[1]
+ fmul v25.2s, v1.2s, v9.2s[0]
+
+ fmul v24.2s, v0.2s, v9.2s[0]
+ fmul v21.2s, v1.2s, v8.2s[1]
+
+ fmul v28.2s, v0.2s, v9.2s[1]
+ fmul v17.2s, v1.2s, v8.2s[0]
+
+ ld1 {v12.2s, v13.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v4.2s, v5.2s}, [pA]
+ add pA, pA, #16
+.endm
+
+.macro KERNEL4x4_M1
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ ld1 {v12.2s, v13.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ ld1 {v4.2s, v5.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ prfm PLDL1KEEP, [pB, #512]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro KERNEL4x4_M2
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ ld1 {v8.2s, v9.2s}, [pB] // For next round
+ add pB, pB, #16
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ ld1 {v0.2s, v1.2s}, [pA] // For next round
+ add pA, pA, #16
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ prfm PLDL1KEEP, [pA, #512]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_E
+ fmla v16.2s, v4.2s, v12.2s[0]
+ fmla v29.2s, v5.2s, v13.2s[1]
+
+ fmla v20.2s, v4.2s, v12.2s[1]
+ fmla v25.2s, v5.2s, v13.2s[0]
+
+ fmla v24.2s, v4.2s, v13.2s[0]
+ fmla v21.2s, v5.2s, v12.2s[1]
+
+ fmla v28.2s, v4.2s, v13.2s[1]
+ fmla v17.2s, v5.2s, v12.2s[0]
+.endm
+
+.macro KERNEL4x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v29.2s, v1.2s, v9.2s[1]
+
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v25.2s, v1.2s, v9.2s[0]
+
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v21.2s, v1.2s, v8.2s[1]
+
+ fmla v28.2s, v0.2s, v9.2s[1]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV0
+ fmul v9.2s, v25.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV2
+ fmul v13.2s, v29.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+ fmov s16, wzr
+ fmov s20, s16
+ fmov s24, s20
+ fmov s28, s16
+.endm
+
+.macro KERNEL2x4_SUB
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v24.2s, v0.2s, v9.2s[0]
+ fmla v28.2s, v0.2s, v9.2s[1]
+.endm
+
+.macro SAVE2x4
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+
+ fmul v8.2s, v24.2s, alphaV2
+ st1 {v8.2s}, [pCRow2]
+
+ add pCRow1, pCRow2, LDC
+
+ fmul v12.2s, v28.2s, alphaV3
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x4
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL1x4_SUB
+ ldr s0, [pA]
+ add pA, pA, #4
+
+ ld1 {v8.2s, v9.2s}, [pB]
+ add pB, pB, #16
+
+ fmla v16.2s, v8.2s, v0.s[0]
+ fmla v20.2s, v9.2s, v0.s[0]
+.endm
+
+.macro SAVE1x4
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow2, pCRow1, LDC
+ add pCRow1, pCRow2, LDC
+
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.s}[0], [pCRow2]
+ st1 {v12.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL8x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+
+ fmla v20.4s, v0.4s, v8.2s[1]
+ fmla v21.4s, v1.4s, v8.2s[1]
+.endm
+
+.macro SAVE8x2
+ add pCRow1, pCRow0, LDC
+
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow2, pCRow1, LDC
+
+
+ fmul v4.4s, v20.4s, alphaV0
+ fmul v5.4s, v21.4s, alphaV1
+ st1 {v4.4s, v5.4s}, [pCRow1]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x2
+ fmov s16, wzr
+ fmov s17, s16
+ fmov s20, s17
+ fmov s21, s16
+.endm
+
+.macro KERNEL4x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+ fmla v21.2s, v1.2s, v8.2s[1]
+.endm
+
+.macro SAVE4x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow1, pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV2
+ fmul v13.2s, v21.2s, alphaV3
+ st1 {v12.2s, v13.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x2
+ fmov s16, wzr
+ fmov s20, s16
+.endm
+
+.macro KERNEL2x2_SUB
+ ld1 {v8.2s}, [pB]
+ add pB, pB, #8
+
+ ld1 {v0.2s}, [pA]
+ add pA, pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v20.2s, v0.2s, v8.2s[1]
+.endm
+
+.macro SAVE2x2
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow1 , pCRow0, LDC
+
+ fmul v12.2s, v20.2s, alphaV1
+ st1 {v12.2s}, [pCRow1]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x2_SUB
+ ld1 {v8.2s} , [pB]
+ add pB , pB, #8
+
+ ldr s0 , [pA]
+ add pA, pA, #4
+
+ fmla v16.2s, v8.2s, v0.2s[0]
+.endm
+
+.macro SAVE1x2
+ add pCRow1 , pCRow0, LDC
+
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.s}[0], [pCRow0]
+ st1 {v8.s}[1], [pCRow1]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/******************************************************************************/
+
+.macro INIT8x1
+ fmov s16, wzr
+ fmov s17, wzr
+.endm
+
+.macro KERNEL8x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.4s}, [pA]
+ add pA, pA, #16
+ ld1 {v1.4s}, [pA]
+ add pA, pA, #16
+
+ fmla v16.4s, v0.4s, v8.2s[0]
+ fmla v17.4s, v1.4s, v8.2s[0]
+.endm
+
+.macro SAVE8x1
+
+ fmul v0.4s, v16.4s, alphaV0
+ fmul v1.4s, v17.4s, alphaV1
+ st1 {v0.4s, v1.4s}, [pCRow0]
+
+ add pCRow0, pCRow0, #32
+.endm
+
+/******************************************************************************/
+
+.macro INIT4x1
+ fmov s16, wzr
+ fmov s17, s16
+.endm
+
+.macro KERNEL4x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s, v1.2s}, [pA]
+ add pA , pA, #16
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+ fmla v17.2s, v1.2s, v8.2s[0]
+.endm
+
+.macro SAVE4x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ fmul v9.2s, v17.2s, alphaV1
+ st1 {v8.2s, v9.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #16
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL2x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ld1 {v0.2s}, [pA]
+ add pA , pA, #8
+
+ fmla v16.2s, v0.2s, v8.2s[0]
+.endm
+
+.macro SAVE2x1
+
+ fmul v8.2s, v16.2s, alphaV0
+ st1 {v8.2s}, [pCRow0]
+
+ add pCRow0, pCRow0, #8
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+ fmov s16, wzr
+.endm
+
+.macro KERNEL1x1_SUB
+ ldr s8, [pB]
+ add pB , pB, #4
+
+ ldr s0, [pA]
+ add pA , pA, #4
+
+ fmadd s16, s0, s8, s16
+.endm
+
+.macro SAVE1x1
+
+ fmul s8, s16, alpha0
+ str s8, [pCRow0]
+
+ add pCRow0, pCRow0, #4
+.endm
+
+/*******************************************************************************
+* End of macro definitions
+*******************************************************************************/
+
+ PROLOGUE
+
+strmm_kernel_begin:
+
+ .align 5
+ add sp, sp, #-(11 * 16)
+ stp d8, d9, [sp, #(0 * 16)]
+ stp d10, d11, [sp, #(1 * 16)]
+ stp d12, d13, [sp, #(2 * 16)]
+ stp d14, d15, [sp, #(3 * 16)]
+ stp d16, d17, [sp, #(4 * 16)]
+ stp x18, x19, [sp, #(5 * 16)]
+ stp x20, x21, [sp, #(6 * 16)]
+ stp x22, x23, [sp, #(7 * 16)]
+ stp x24, x25, [sp, #(8 * 16)]
+ stp x26, x27, [sp, #(9 * 16)]
+ str x28, [sp, #(10 * 16)]
+
+ fmov alpha0, s0
+ fmov alpha1, s0
+ fmov alpha2, s0
+ fmov alpha3, s0
+
+ lsl LDC, LDC, #2 // ldc = ldc * 4
+
+#if !defined(LEFT)
+ neg tempOffset, offset
+#endif
+ mov pB, origPB
+
+ mov counterJ, origN
+ asr counterJ, counterJ, #3 // J = J / 8
+ cmp counterJ, #0
+ ble strmm_kernel_L4_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L8_BEGIN:
+ mov pCRow0, pC // pCRow0 = C
+ add pC, pC, LDC, lsl #3
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = start of A array
+
+/******************************************************************************/
+
+strmm_kernel_L8_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble strmm_kernel_L8_M4_BEGIN
+
+strmm_kernel_L8_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L8_M8_32
+
+ KERNEL8x8_I // do one in the K
+ KERNEL8x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L8_M8_22a
+ .align 5
+
+strmm_kernel_L8_M8_22:
+
+ KERNEL8x8_M1
+ KERNEL8x8_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M8_22
+
+strmm_kernel_L8_M8_22a:
+
+ KERNEL8x8_M1
+ KERNEL8x8_E
+
+ b strmm_kernel_L8_M8_44
+
+strmm_kernel_L8_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L8_M8_40
+
+ KERNEL8x8_I
+ KERNEL8x8_E
+
+ b strmm_kernel_L8_M8_44
+
+strmm_kernel_L8_M8_40:
+
+ INIT8x8
+
+strmm_kernel_L8_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L8_M8_100
+
+strmm_kernel_L8_M8_46:
+
+ KERNEL8x8_SUB
+
+strmm_kernel_L8_M8_100:
+
+ SAVE8x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+strmm_kernel_L8_M8_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L8_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L8_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L8_END
+
+ tst counterI, #4
+ ble strmm_kernel_L8_M2_BEGIN
+
+strmm_kernel_L8_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L8_M4_32
+
+ KERNEL4x8_I // do one in the K
+ KERNEL4x8_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L8_M4_22a
+ .align 5
+
+strmm_kernel_L8_M4_22:
+
+ KERNEL4x8_M1
+ KERNEL4x8_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M4_22
+
+strmm_kernel_L8_M4_22a:
+
+ KERNEL4x8_M1
+ KERNEL4x8_E
+
+ b strmm_kernel_L8_M4_44
+
+strmm_kernel_L8_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L8_M4_40
+
+ KERNEL4x8_I
+ KERNEL4x8_E
+
+ b strmm_kernel_L8_M4_44
+
+strmm_kernel_L8_M4_40:
+
+ INIT4x8
+
+strmm_kernel_L8_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L8_M4_100
+
+strmm_kernel_L8_M4_46:
+
+ KERNEL4x8_SUB
+
+strmm_kernel_L8_M4_100:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+strmm_kernel_L8_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L8_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L8_END
+
+	tst	counterI, #2				// is a block of 2 rows left? (M & 2)
+ ble strmm_kernel_L8_M1_BEGIN
+
+strmm_kernel_L8_M2_20:
+
+ INIT2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L8_M2_40
+
+strmm_kernel_L8_M2_22:
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M2_22
+
+
+strmm_kernel_L8_M2_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L8_M2_100
+
+strmm_kernel_L8_M2_42:
+
+ KERNEL2x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M2_42
+
+strmm_kernel_L8_M2_100:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L8_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L8_M1_BEGIN:
+
+ tst counterI, #1 // counterI = counterI % 2
+ ble strmm_kernel_L8_END
+
+strmm_kernel_L8_M1_20:
+
+ INIT1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+ lsl temp, tempOffset, #5
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #8
+#endif
+
+ asr counterL , tempK, #3 // counterL = counterL / 8
+ cmp counterL , #0
+ ble strmm_kernel_L8_M1_40
+
+strmm_kernel_L8_M1_22:
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M1_22
+
+
+strmm_kernel_L8_M1_40:
+
+ ands counterL , tempK, #7 // counterL = counterL % 8
+ ble strmm_kernel_L8_M1_100
+
+strmm_kernel_L8_M1_42:
+
+ KERNEL1x8_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L8_M1_42
+
+strmm_kernel_L8_M1_100:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #8
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #5
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+
+strmm_kernel_L8_END:
+ lsl temp, origK, #5 // B = B + K * 4 * 8
+ add origPB, origPB, temp
+
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+
+ subs counterJ, counterJ , #1 // j--
+ bgt strmm_kernel_L8_BEGIN
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L4_BEGIN:
+
+ mov counterJ , origN
+ tst counterJ , #7
+ ble strmm_kernel_L999
+
+ tst counterJ , #4
+ ble strmm_kernel_L2_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #2
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L4_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI, #0
+ ble strmm_kernel_L4_M4_BEGIN
+
+strmm_kernel_L4_M8_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #4
+#endif
+
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M8_32
+
+ KERNEL8x4_I // do one in the K
+ KERNEL8x4_M2 // do another in the K
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M8_22a
+ .align 5
+
+strmm_kernel_L4_M8_22:
+
+ KERNEL8x4_M1
+ KERNEL8x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M8_22
+
+strmm_kernel_L4_M8_22a:
+
+ KERNEL8x4_M1
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M8_40
+
+ KERNEL8x4_I
+ KERNEL8x4_E
+
+ b strmm_kernel_L4_M8_44
+
+strmm_kernel_L4_M8_40:
+
+ INIT8x4
+
+strmm_kernel_L4_M8_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M8_100
+
+strmm_kernel_L4_M8_46:
+
+ KERNEL8x4_SUB
+
+strmm_kernel_L4_M8_100:
+
+ SAVE8x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L4_M8_END:
+ subs counterI, counterI, #1
+ bne strmm_kernel_L4_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L4_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L4_END
+
+ tst counterI, #4
+ ble strmm_kernel_L4_M2_BEGIN
+
+strmm_kernel_L4_M4_20:
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #4
+#endif
+ asr counterL , tempK, #1 // L = K / 2
+ cmp counterL , #2 // is there at least 4 to do?
+ blt strmm_kernel_L4_M4_32
+
+	KERNEL4x4_I					// first K iteration: initializes the accumulators
+	KERNEL4x4_M2					// second K iteration
+
+ subs counterL, counterL, #2
+ ble strmm_kernel_L4_M4_22a
+ .align 5
+
+strmm_kernel_L4_M4_22:
+
+ KERNEL4x4_M1
+ KERNEL4x4_M2
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M4_22
+
+strmm_kernel_L4_M4_22a:
+
+ KERNEL4x4_M1
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_32:
+
+ tst counterL, #1
+ ble strmm_kernel_L4_M4_40
+
+ KERNEL4x4_I
+ KERNEL4x4_E
+
+ b strmm_kernel_L4_M4_44
+
+strmm_kernel_L4_M4_40:
+
+ INIT4x4
+
+strmm_kernel_L4_M4_44:
+
+ ands counterL , tempK, #1
+ ble strmm_kernel_L4_M4_100
+
+strmm_kernel_L4_M4_46:
+
+ KERNEL4x4_SUB
+
+strmm_kernel_L4_M4_100:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L4_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L4_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L4_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L4_M1_BEGIN
+
+strmm_kernel_L4_M2_20:
+
+ INIT2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #4
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M2_40
+
+strmm_kernel_L4_M2_22:
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_22
+
+
+strmm_kernel_L4_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L4_M2_100
+
+strmm_kernel_L4_M2_42:
+
+ KERNEL2x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M2_42
+
+strmm_kernel_L4_M2_100:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L4_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L4_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L4_END
+
+strmm_kernel_L4_M1_20:
+
+ INIT1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #4
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #4
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L4_M1_40
+
+strmm_kernel_L4_M1_22:
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_22
+
+
+strmm_kernel_L4_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L4_M1_100
+
+strmm_kernel_L4_M1_42:
+
+ KERNEL1x4_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L4_M1_42
+
+strmm_kernel_L4_M1_100:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #4
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #4
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L4_END:
+ add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L2_BEGIN:   // fewer than 4 columns left in N; process 2 of them if (N & 2) != 0
+
+ mov counterJ , origN
+ tst counterJ , #3
+ ble strmm_kernel_L999
+
+ tst counterJ , #2
+ ble strmm_kernel_L1_BEGIN
+
+ mov pCRow0, pC // pCRow0 = pC
+
+ add pC,pC,LDC, lsl #1
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L2_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3 // counterI = counterI / 8
+ cmp counterI,#0
+ ble strmm_kernel_L2_M4_BEGIN
+
+strmm_kernel_L2_M8_20:
+
+ INIT8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M8_40
+ .align 5
+
+strmm_kernel_L2_M8_22:
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_22
+
+
+strmm_kernel_L2_M8_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M8_100
+
+strmm_kernel_L2_M8_42:
+
+ KERNEL8x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M8_42
+
+strmm_kernel_L2_M8_100:
+
+ SAVE8x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L2_M8_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L2_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L2_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L2_END
+
+ tst counterI, #4
+ ble strmm_kernel_L2_M2_BEGIN
+
+strmm_kernel_L2_M4_20:
+
+ INIT4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M4_40
+ .align 5
+
+strmm_kernel_L2_M4_22:
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_22
+
+
+strmm_kernel_L2_M4_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M4_100
+
+strmm_kernel_L2_M4_42:
+
+ KERNEL4x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M4_42
+
+strmm_kernel_L2_M4_100:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L2_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L2_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L2_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L2_M1_BEGIN
+
+strmm_kernel_L2_M2_20:
+
+ INIT2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL,#0
+ ble strmm_kernel_L2_M2_40
+
+strmm_kernel_L2_M2_22:
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_22
+
+
+strmm_kernel_L2_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M2_100
+
+strmm_kernel_L2_M2_42:
+
+ KERNEL2x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M2_42
+
+strmm_kernel_L2_M2_100:
+
+ SAVE2x2
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+
+strmm_kernel_L2_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L2_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L2_END
+
+strmm_kernel_L2_M1_20:
+
+ INIT1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #3
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #2
+#endif
+	asr	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL, #0
+ ble strmm_kernel_L2_M1_40
+
+strmm_kernel_L2_M1_22:
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_22
+
+
+strmm_kernel_L2_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L2_M1_100
+
+strmm_kernel_L2_M1_42:
+
+ KERNEL1x2_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L2_M1_42
+
+strmm_kernel_L2_M1_100:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #1
+#else
+ sub tempK, tempK, #2
+#endif
+ lsl temp, tempK, #2
+ add pA, pA, temp
+ lsl temp, tempK, #3
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #1
+#endif
+strmm_kernel_L2_END:
+#if !defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+ add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4
+
+/******************************************************************************/
+/******************************************************************************/
+
+strmm_kernel_L1_BEGIN:
+
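+	// N remainder: process the final single column when (N & 1) != 0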
+ mov counterJ , origN
+ tst counterJ , #1
+ ble strmm_kernel_L999 // done
+
+
+ mov pCRow0, pC // pCRow0 = C
+	add	pC , pC , LDC				// advance pC to the next column of C
+
+#if defined(LEFT)
+ mov tempOffset, offset
+#endif
+ mov pA, origPA // pA = A
+
+/******************************************************************************/
+
+strmm_kernel_L1_M8_BEGIN:
+
+ mov counterI, origM
+ asr counterI, counterI, #3
+ cmp counterI, #0
+ ble strmm_kernel_L1_M4_BEGIN
+
+strmm_kernel_L1_M8_20:
+
+ INIT8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #5
+ add pA, pA, temp
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #8
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M8_40
+ .align 5
+
+strmm_kernel_L1_M8_22:
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_22
+
+
+strmm_kernel_L1_M8_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M8_100
+
+strmm_kernel_L1_M8_42:
+
+ KERNEL8x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M8_42
+
+strmm_kernel_L1_M8_100:
+
+ SAVE8x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #8
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #5
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #8
+#endif
+strmm_kernel_L1_M8_END:
+
+ subs counterI, counterI, #1
+ bgt strmm_kernel_L1_M8_20
+
+/******************************************************************************/
+
+strmm_kernel_L1_M4_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #7
+ ble strmm_kernel_L1_END
+
+ tst counterI, #4
+ ble strmm_kernel_L1_M2_BEGIN
+
+strmm_kernel_L1_M4_20:
+
+ INIT4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #4
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #4
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M4_40
+ .align 5
+
+strmm_kernel_L1_M4_22:
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_22
+
+
+strmm_kernel_L1_M4_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M4_100
+
+strmm_kernel_L1_M4_42:
+
+ KERNEL4x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M4_42
+
+strmm_kernel_L1_M4_100:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #4
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #4
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #4
+#endif
+strmm_kernel_L1_M4_END:
+
+/******************************************************************************/
+
+strmm_kernel_L1_M2_BEGIN:
+
+ mov counterI, origM
+ tst counterI , #3
+ ble strmm_kernel_L1_END
+
+	tst	counterI, #2				// process 2 rows only if (M & 2) != 0
+ ble strmm_kernel_L1_M1_BEGIN
+
+strmm_kernel_L1_M2_20:
+
+ INIT2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #3
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #2
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M2_40
+
+strmm_kernel_L1_M2_22:
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_22
+
+
+strmm_kernel_L1_M2_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M2_100
+
+strmm_kernel_L1_M2_42:
+
+ KERNEL2x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M2_42
+
+strmm_kernel_L1_M2_100:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub tempK, origK, tempOffset
+#if defined(LEFT)
+ sub tempK, tempK, #2
+#else
+ sub tempK, tempK, #1
+#endif
+ lsl temp, tempK, #3
+ add pA, pA, temp
+ lsl temp, tempK, #2
+ add pB, pB, temp
+#endif
+#if defined(LEFT)
+ add tempOffset, tempOffset, #2
+#endif
+strmm_kernel_L1_M2_END:
+
+/******************************************************************************/
+
+strmm_kernel_L1_M1_BEGIN:
+
+	tst	counterI, #1				// process the last row only if (M & 1) != 0
+ ble strmm_kernel_L1_END
+
+strmm_kernel_L1_M1_20:
+
+ INIT1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mov pB, origPB
+#else
+ mov pB, origPB
+ lsl temp, tempOffset, #2
+ add pB, pB, temp
+ lsl temp, tempOffset, #2
+ add pA, pA, temp
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub tempK, origK, tempOffset
+#elif defined(LEFT)
+ add tempK, tempOffset, #1
+#else
+ add tempK, tempOffset, #1
+#endif
+	asr 	counterL , tempK, #3		// counterL = tempK / 8
+ cmp counterL , #0
+ ble strmm_kernel_L1_M1_40
+
+strmm_kernel_L1_M1_22:
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_22
+
+
+strmm_kernel_L1_M1_40:
+
+	ands	counterL , tempK, #7		// counterL = tempK % 8
+ ble strmm_kernel_L1_M1_100
+
+strmm_kernel_L1_M1_42:
+
+ KERNEL1x1_SUB
+
+ subs counterL, counterL, #1
+ bgt strmm_kernel_L1_M1_42
+
+strmm_kernel_L1_M1_100:
+
+ SAVE1x1
+
+strmm_kernel_L1_END:
+
+/******************************************************************************/
+
+strmm_kernel_L999:
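+	// Restore the registers saved in the prologue (d8-d17, x18-x28), release the stack frame and return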
+ mov x0, #0 // set return value
+ ldp d8, d9, [sp, #(0 * 16)]
+ ldp d10, d11, [sp, #(1 * 16)]
+ ldp d12, d13, [sp, #(2 * 16)]
+ ldp d14, d15, [sp, #(3 * 16)]
+ ldp d16, d17, [sp, #(4 * 16)]
+ ldp x18, x19, [sp, #(5 * 16)]
+ ldp x20, x21, [sp, #(6 * 16)]
+ ldp x22, x23, [sp, #(7 * 16)]
+ ldp x24, x25, [sp, #(8 * 16)]
+ ldp x26, x27, [sp, #(9 * 16)]
+ ldr x28, [sp, #(10 * 16)]
+ add sp, sp, #(11*16)
+ ret
+
+ EPILOGUE
+
diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL
index cb9ed848b..c3c86b310 100644
--- a/kernel/power/KERNEL
+++ b/kernel/power/KERNEL
@@ -1,57 +1,3 @@
-SGEMM_BETA = gemm_beta.S
-DGEMM_BETA = gemm_beta.S
-CGEMM_BETA = zgemm_beta.S
-ZGEMM_BETA = zgemm_beta.S
-
-
-ifndef SSYMV_U_KERNEL
-SSYMV_U_KERNEL = symv_U.S
-endif
-
-ifndef SSYMV_L_KERNEL
-SSYMV_L_KERNEL = symv_L.S
-endif
-
-ifndef DSYMV_U_KERNEL
-DSYMV_U_KERNEL = symv_U.S
-endif
-
-ifndef DSYMV_L_KERNEL
-DSYMV_L_KERNEL = symv_L.S
-endif
-
-ifndef CSYMV_U_KERNEL
-CSYMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef CSYMV_L_KERNEL
-CSYMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef ZSYMV_U_KERNEL
-ZSYMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef ZSYMV_L_KERNEL
-ZSYMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef CHEMV_U_KERNEL
-CHEMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef CHEMV_L_KERNEL
-CHEMV_L_KERNEL = zsymv_L.S
-endif
-
-ifndef ZHEMV_U_KERNEL
-ZHEMV_U_KERNEL = zsymv_U.S
-endif
-
-ifndef ZHEMV_L_KERNEL
-ZHEMV_L_KERNEL = zsymv_L.S
-endif
-
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
endif
@@ -84,3 +30,23 @@ ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta.S
+endif
+
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta.S
+endif
+
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta.S
+endif
+
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta.S
+endif
+
+ifndef DSDOTKERNEL
+DSDOTKERNEL = ../generic/dot.c
+endif
+
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
new file mode 100644
index 000000000..760d568cd
--- /dev/null
+++ b/kernel/power/KERNEL.POWER8
@@ -0,0 +1,175 @@
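+# POWER8 kernel selection: hand-written assembly kernels where available, generic C implementations otherwise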
+#SGEMM_BETA = ../generic/gemm_beta.c
+#DGEMM_BETA = ../generic/gemm_beta.c
+#CGEMM_BETA = ../generic/zgemm_beta.c
+#ZGEMM_BETA = ../generic/zgemm_beta.c
+
+STRMMKERNEL = gemm_kernel_power6.S
+DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
+CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
+
+SGEMMKERNEL = gemm_kernel_power6.S
+SGEMMONCOPY = ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ = sgemm_oncopy.o
+SGEMMOTCOPYOBJ = sgemm_otcopy.o
+
+DGEMMKERNEL = dgemm_kernel_16x4_power8.S
+DGEMMINCOPY = ../generic/gemm_ncopy_16.c
+DGEMMITCOPY = ../generic/gemm_tcopy_16.c
+DGEMMONCOPY = gemm_ncopy_4.S
+DGEMMOTCOPY = gemm_tcopy_4.S
+DGEMMINCOPYOBJ = dgemm_incopy.o
+DGEMMITCOPYOBJ = dgemm_itcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ = cgemm_oncopy.o
+CGEMMOTCOPYOBJ = cgemm_otcopy.o
+
+ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
+ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
+ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
+ZGEMMONCOPYOBJ = zgemm_oncopy.o
+ZGEMMOTCOPYOBJ = zgemm_otcopy.o
+ZGEMMINCOPYOBJ = zgemm_incopy.o
+ZGEMMITCOPYOBJ = zgemm_itcopy.o
+
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+#TODO: CGEMM3MKERNEL should use 4x4 block sizes.
+#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
+#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
+
+#Pure C for other kernels
+#SAMAXKERNEL = ../arm/amax.c
+#DAMAXKERNEL = ../arm/amax.c
+#CAMAXKERNEL = ../arm/zamax.c
+#ZAMAXKERNEL = ../arm/zamax.c
+#
+#SAMINKERNEL = ../arm/amin.c
+#DAMINKERNEL = ../arm/amin.c
+#CAMINKERNEL = ../arm/zamin.c
+#ZAMINKERNEL = ../arm/zamin.c
+#
+#SMAXKERNEL = ../arm/max.c
+#DMAXKERNEL = ../arm/max.c
+#
+#SMINKERNEL = ../arm/min.c
+#DMINKERNEL = ../arm/min.c
+#
+#ISAMAXKERNEL = ../arm/iamax.c
+#IDAMAXKERNEL = ../arm/iamax.c
+#ICAMAXKERNEL = ../arm/izamax.c
+#IZAMAXKERNEL = ../arm/izamax.c
+#
+#ISAMINKERNEL = ../arm/iamin.c
+#IDAMINKERNEL = ../arm/iamin.c
+#ICAMINKERNEL = ../arm/izamin.c
+#IZAMINKERNEL = ../arm/izamin.c
+#
+#ISMAXKERNEL = ../arm/imax.c
+#IDMAXKERNEL = ../arm/imax.c
+#
+#ISMINKERNEL = ../arm/imin.c
+#IDMINKERNEL = ../arm/imin.c
+#
+#SASUMKERNEL = ../arm/asum.c
+#DASUMKERNEL = ../arm/asum.c
+#CASUMKERNEL = ../arm/zasum.c
+#ZASUMKERNEL = ../arm/zasum.c
+#
+#SAXPYKERNEL = ../arm/axpy.c
+#DAXPYKERNEL = ../arm/axpy.c
+#CAXPYKERNEL = ../arm/zaxpy.c
+#ZAXPYKERNEL = ../arm/zaxpy.c
+#
+#SCOPYKERNEL = ../arm/copy.c
+#DCOPYKERNEL = ../arm/copy.c
+#CCOPYKERNEL = ../arm/zcopy.c
+#ZCOPYKERNEL = ../arm/zcopy.c
+#
+#SDOTKERNEL = ../arm/dot.c
+#DDOTKERNEL = ../arm/dot.c
+#CDOTKERNEL = ../arm/zdot.c
+#ZDOTKERNEL = ../arm/zdot.c
+#
+#SNRM2KERNEL = ../arm/nrm2.c
+#DNRM2KERNEL = ../arm/nrm2.c
+#CNRM2KERNEL = ../arm/znrm2.c
+#ZNRM2KERNEL = ../arm/znrm2.c
+#
+#SROTKERNEL = ../arm/rot.c
+#DROTKERNEL = ../arm/rot.c
+#CROTKERNEL = ../arm/zrot.c
+#ZROTKERNEL = ../arm/zrot.c
+#
+#SSCALKERNEL = ../arm/scal.c
+#DSCALKERNEL = ../arm/scal.c
+#CSCALKERNEL = ../arm/zscal.c
+#ZSCALKERNEL = ../arm/zscal.c
+#
+#SSWAPKERNEL = ../arm/swap.c
+#DSWAPKERNEL = ../arm/swap.c
+#CSWAPKERNEL = ../arm/zswap.c
+#ZSWAPKERNEL = ../arm/zswap.c
+#
+
+#SGEMVNKERNEL = ../arm/gemv_n.c
+#DGEMVNKERNEL = ../arm/gemv_n.c
+#CGEMVNKERNEL = ../arm/zgemv_n.c
+#ZGEMVNKERNEL = ../arm/zgemv_n.c
+#
+#SGEMVTKERNEL = ../arm/gemv_t.c
+#DGEMVTKERNEL = ../arm/gemv_t.c
+#CGEMVTKERNEL = ../arm/zgemv_t.c
+#ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+
+#SSYMV_U_KERNEL = ../generic/symv_k.c
+#SSYMV_L_KERNEL = ../generic/symv_k.c
+#DSYMV_U_KERNEL = ../generic/symv_k.c
+#DSYMV_L_KERNEL = ../generic/symv_k.c
+#QSYMV_U_KERNEL = ../generic/symv_k.c
+#QSYMV_L_KERNEL = ../generic/symv_k.c
+#CSYMV_U_KERNEL = ../generic/zsymv_k.c
+#CSYMV_L_KERNEL = ../generic/zsymv_k.c
+#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
+#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
+#XSYMV_U_KERNEL = ../generic/zsymv_k.c
+#XSYMV_L_KERNEL = ../generic/zsymv_k.c
+
+#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
+#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
+
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+#Dump kernel
+CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
+ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
diff --git a/kernel/power/def_vsx.h b/kernel/power/def_vsx.h
new file mode 100644
index 000000000..c2d29e268
--- /dev/null
+++ b/kernel/power/def_vsx.h
@@ -0,0 +1,64 @@
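+/* Numeric aliases vs0-vs63 for the POWER VSX vector-scalar registers */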
+#define vs0 0
+#define vs1 1
+#define vs2 2
+#define vs3 3
+#define vs4 4
+#define vs5 5
+#define vs6 6
+#define vs7 7
+#define vs8 8
+#define vs9 9
+#define vs10 10
+#define vs11 11
+#define vs12 12
+#define vs13 13
+#define vs14 14
+#define vs15 15
+#define vs16 16
+#define vs17 17
+#define vs18 18
+#define vs19 19
+#define vs20 20
+#define vs21 21
+#define vs22 22
+#define vs23 23
+#define vs24 24
+#define vs25 25
+#define vs26 26
+#define vs27 27
+#define vs28 28
+#define vs29 29
+#define vs30 30
+#define vs31 31
+#define vs32 32
+#define vs33 33
+#define vs34 34
+#define vs35 35
+#define vs36 36
+#define vs37 37
+#define vs38 38
+#define vs39 39
+#define vs40 40
+#define vs41 41
+#define vs42 42
+#define vs43 43
+#define vs44 44
+#define vs45 45
+#define vs46 46
+#define vs47 47
+#define vs48 48
+#define vs49 49
+#define vs50 50
+#define vs51 51
+#define vs52 52
+#define vs53 53
+#define vs54 54
+#define vs55 55
+#define vs56 56
+#define vs57 57
+#define vs58 58
+#define vs59 59
+#define vs60 60
+#define vs61 61
+#define vs62 62
+#define vs63 63
diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S
new file mode 100644
index 000000000..c67f31160
--- /dev/null
+++ b/kernel/power/dgemm_kernel_16x4_power8.S
@@ -0,0 +1,348 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs18
+
+#define o0 0
+
+#define o8 r15
+#define o24 r16
+#define ALPHA r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define BB r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "dgemm_macros_16x4_power8.S"
+
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
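+	/* Save the non-volatile registers: f14-f31 here, r15-r31 below */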
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+#endif
+
+ stfd f1, ALPHA_SP
+ stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+ slwi LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+
+ cmpwi cr0, M, 0
+ ble .L999_H1
+ cmpwi cr0, N, 0
+ ble .L999_H1
+ cmpwi cr0, K, 0
+ ble .L999_H1
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ li PRE, 256
+ li o8 , 8
+ li o16, 16
+ li o24, 24
+ li o32, 32
+ li o48, 48
+
+ lxvdsx alpha_r, 0, ALPHA
+
+#include "dgemm_logic_16x4_power8.S"
+
+.L999:
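+	/* Set the return value to 0 and restore the registers saved in the prologue */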
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/dgemm_logic_16x4_power8.S b/kernel/power/dgemm_logic_16x4_power8.S
new file mode 100644
index 000000000..49c438f61
--- /dev/null
+++ b/kernel/power/dgemm_logic_16x4_power8.S
@@ -0,0 +1,1683 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
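+	/* Outer loop over N in panels of 4 columns: J = N / 4 */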
+ srawi. J, N, 2
+ ble .LDGEMM_L4_END
+
+.LDGEMM_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+ srawi. I, M, 4
+ ble .LDGEMM_L4x16_END
+
+.LDGEMM_L4x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x16_SUB4
+
+.LDGEMM_L4x16_LOOP_START:
+
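+	/* Software-pipelined K loop, unrolled by 8; dcbt prefetches the A panel PRE (= 256) bytes ahead */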
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x16_LOOP
+
+.LDGEMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b .LDGEMM_L4x16_SUB1
+
+.LDGEMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b .LDGEMM_L4x16_SUB1
+
+.LDGEMM_L4x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x16_SAVE
+ b .LDGEMM_L4x16_SUB2
+
+.LDGEMM_L4x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x16_SAVE
+
+.LDGEMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x16_SUB2
+
+.LDGEMM_L4x16_SAVE:
+
+ SAVE4x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L4x16_BEGIN
+
+.LDGEMM_L4x16_END:
+
+.LDGEMM_L4x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L4x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L4x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x8_SUB4
+
+.LDGEMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x8_LOOP
+
+.LDGEMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b .LDGEMM_L4x8_SUB1
+
+.LDGEMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b .LDGEMM_L4x8_SUB1
+
+.LDGEMM_L4x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x8_SAVE
+ b .LDGEMM_L4x8_SUB2
+
+.LDGEMM_L4x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x8_SAVE
+
+.LDGEMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x8_SUB2
+
+.LDGEMM_L4x8_SAVE:
+
+ SAVE4x8
+
+.LDGEMM_L4x8_END:
+
+.LDGEMM_L4x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L4x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x4_SUB4
+
+.LDGEMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x4_LOOP
+
+.LDGEMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b .LDGEMM_L4x4_SUB1
+
+.LDGEMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b .LDGEMM_L4x4_SUB1
+
+.LDGEMM_L4x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x4_SAVE
+ b .LDGEMM_L4x4_SUB2
+
+.LDGEMM_L4x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x4_SAVE
+
+.LDGEMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x4_SUB2
+
+.LDGEMM_L4x4_SAVE:
+
+ SAVE4x4
+
+.LDGEMM_L4x4_END:
+
+.LDGEMM_L4x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L4x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x2_SUB4
+
+.LDGEMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x2_LOOP
+
+.LDGEMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b .LDGEMM_L4x2_SUB1
+
+.LDGEMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b .LDGEMM_L4x2_SUB1
+
+.LDGEMM_L4x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x2_SAVE
+ b .LDGEMM_L4x2_SUB2
+
+.LDGEMM_L4x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x2_SAVE
+
+.LDGEMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x2_SUB2
+
+.LDGEMM_L4x2_SAVE:
+
+ SAVE4x2
+
+.LDGEMM_L4x2_END:
+
+.LDGEMM_L4x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L4x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L4x1_SUB4
+
+.LDGEMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L4x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x1_LOOP
+
+.LDGEMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b .LDGEMM_L4x1_SUB1
+
+.LDGEMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b .LDGEMM_L4x1_SUB1
+
+.LDGEMM_L4x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L4x1_SAVE
+ b .LDGEMM_L4x1_SUB2
+
+.LDGEMM_L4x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L4x1_SAVE
+
+.LDGEMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L4x1_SUB2
+
+.LDGEMM_L4x1_SAVE:
+
+ SAVE4x1
+
+.LDGEMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt .LDGEMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble .L999
+
+.LDGEMM_L4_END:
+
+ b .LDGEMM_L2_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LDGEMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble .LDGEMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 4
+ ble .LDGEMM_L2x16_END
+
+.LDGEMM_L2x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x16_SUB4
+
+.LDGEMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x16_LOOP
+
+.LDGEMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b .LDGEMM_L2x16_SUB1
+
+.LDGEMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b .LDGEMM_L2x16_SUB1
+
+.LDGEMM_L2x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x16_SAVE
+ b .LDGEMM_L2x16_SUB2
+
+.LDGEMM_L2x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x16_SAVE
+
+.LDGEMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x16_SUB2
+
+.LDGEMM_L2x16_SAVE:
+
+ SAVE2x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L2x16_BEGIN
+
+.LDGEMM_L2x16_END:
+
+.LDGEMM_L2x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L2x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L2x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x8_SUB4
+
+.LDGEMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x8_LOOP
+
+.LDGEMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LDGEMM_L2x8_SUB1
+
+.LDGEMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LDGEMM_L2x8_SUB1
+
+.LDGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x8_SAVE
+ b .LDGEMM_L2x8_SUB2
+
+.LDGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x8_SAVE
+
+.LDGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x8_SUB2
+
+.LDGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+.LDGEMM_L2x8_END:
+
+.LDGEMM_L2x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L2x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x4_SUB4
+
+.LDGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x4_LOOP
+
+.LDGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LDGEMM_L2x4_SUB1
+
+.LDGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LDGEMM_L2x4_SUB1
+
+.LDGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x4_SAVE
+ b .LDGEMM_L2x4_SUB2
+
+.LDGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x4_SAVE
+
+.LDGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x4_SUB2
+
+.LDGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+.LDGEMM_L2x4_END:
+
+.LDGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L2x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x2_SUB4
+
+.LDGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x2_LOOP
+
+.LDGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LDGEMM_L2x2_SUB1
+
+.LDGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LDGEMM_L2x2_SUB1
+
+.LDGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x2_SAVE
+ b .LDGEMM_L2x2_SUB2
+
+.LDGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x2_SAVE
+
+.LDGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x2_SUB2
+
+.LDGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+.LDGEMM_L2x2_END:
+
+.LDGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L2x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L2x1_SUB4
+
+.LDGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L2x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x1_LOOP
+
+.LDGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LDGEMM_L2x1_SUB1
+
+.LDGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LDGEMM_L2x1_SUB1
+
+.LDGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L2x1_SAVE
+ b .LDGEMM_L2x1_SUB2
+
+.LDGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L2x1_SAVE
+
+.LDGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L2x1_SUB2
+
+.LDGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+.LDGEMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+.LDGEMM_L2_END:
+.LDGEMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LDGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 4
+ ble .LDGEMM_L1x16_END
+
+.LDGEMM_L1x16_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x16_SUB4
+
+.LDGEMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x16_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x16_LOOP
+
+.LDGEMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b .LDGEMM_L1x16_SUB1
+
+.LDGEMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b .LDGEMM_L1x16_SUB1
+
+.LDGEMM_L1x16_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x16_SAVE
+ b .LDGEMM_L1x16_SUB2
+
+.LDGEMM_L1x16_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x16_SAVE
+
+.LDGEMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x16_SUB2
+
+.LDGEMM_L1x16_SAVE:
+
+ SAVE1x16
+
+ addic. I, I, -1
+ bgt .LDGEMM_L1x16_BEGIN
+
+.LDGEMM_L1x16_END:
+
+.LDGEMM_L1x8_BEGIN:
+
+ andi. T2, M, 15
+ ble .LDGEMM_L1x1_END
+
+ andi. T1, M, 8
+ ble .LDGEMM_L1x8_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x8_SUB4
+
+.LDGEMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x8_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x8_LOOP
+
+.LDGEMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LDGEMM_L1x8_SUB1
+
+.LDGEMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LDGEMM_L1x8_SUB1
+
+.LDGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x8_SAVE
+ b .LDGEMM_L1x8_SUB2
+
+.LDGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x8_SAVE
+
+.LDGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x8_SUB2
+
+.LDGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+.LDGEMM_L1x8_END:
+
+.LDGEMM_L1x4_BEGIN:
+
+
+ andi. T1, M, 4
+ ble .LDGEMM_L1x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x4_SUB4
+
+.LDGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x4_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x4_LOOP
+
+.LDGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LDGEMM_L1x4_SUB1
+
+.LDGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LDGEMM_L1x4_SUB1
+
+.LDGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x4_SAVE
+ b .LDGEMM_L1x4_SUB2
+
+.LDGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x4_SAVE
+
+.LDGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x4_SUB2
+
+.LDGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+.LDGEMM_L1x4_END:
+
+.LDGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LDGEMM_L1x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x2_SUB4
+
+.LDGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x2_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x2_LOOP
+
+.LDGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LDGEMM_L1x2_SUB1
+
+.LDGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LDGEMM_L1x2_SUB1
+
+.LDGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x2_SAVE
+ b .LDGEMM_L1x2_SUB2
+
+.LDGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x2_SAVE
+
+.LDGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x2_SUB2
+
+.LDGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+.LDGEMM_L1x2_END:
+
+.LDGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LDGEMM_L1x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LDGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDGEMM_L1x1_SUB4
+
+.LDGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LDGEMM_L1x1_LOOP_END
+
+ .align 5
+
+.LDGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x1_LOOP
+
+.LDGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LDGEMM_L1x1_SUB1
+
+.LDGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LDGEMM_L1x1_SUB1
+
+.LDGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDGEMM_L1x1_SAVE
+ b .LDGEMM_L1x1_SUB2
+
+.LDGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble .LDGEMM_L1x1_SAVE
+
+.LDGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDGEMM_L1x1_SUB2
+
+.LDGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+.LDGEMM_L1x1_END:
+
+.LDGEMM_L1_END:
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
new file mode 100644
index 000000000..27c05e08e
--- /dev/null
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -0,0 +1,3435 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************
+* Macros for N=4, M=16 *
+*********************************************************************/
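+
+/* Register usage for the 16x4 kernel: vs0-vs7 and vs8-vs15 hold two
+ * alternating sets of 16 A values, vs24-vs27 and vs28-vs31 hold the four
+ * broadcast B values (lxvdsx), and vs32-vs63 accumulate the 16x4 C tile.
+ * Naming convention used by every tile size below: _I1 starts a pipelined
+ * K loop (xvmuldp initializes the accumulators while the next set is
+ * loaded), _1 and _2 are the alternating pipelined steps, _E2 drains the
+ * last prefetched set without loading, and _SUBI1/_SUB1 are single,
+ * non-pipelined iterations for short loops and the K & 7 tail (_SUBI1
+ * initializes with xvmuldp, _SUB1 accumulates with xvmaddadp). */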
+
+.macro LOAD4x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+ xvmaddadp vs52, vs12, vs30
+ xvmaddadp vs53, vs13, vs30
+ xvmaddadp vs54, vs14, vs30
+ xvmaddadp vs55, vs15, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+ xvmaddadp vs60, vs12, vs31
+ xvmaddadp vs61, vs13, vs31
+ xvmaddadp vs62, vs14, vs31
+ xvmaddadp vs63, vs15, vs31
+
+.endm
+
+.macro KERNEL4x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+ xvmuldp vs52, vs4, vs26
+ xvmuldp vs53, vs5, vs26
+ xvmuldp vs54, vs6, vs26
+ xvmuldp vs55, vs7, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+ xvmuldp vs60, vs4, vs27
+ xvmuldp vs61, vs5, vs27
+ xvmuldp vs62, vs6, vs27
+ xvmuldp vs63, vs7, vs27
+
+.endm
+
+.macro KERNEL4x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+ xvmaddadp vs52, vs4, vs26
+ xvmaddadp vs53, vs5, vs26
+ xvmaddadp vs54, vs6, vs26
+ xvmaddadp vs55, vs7, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+ xvmaddadp vs60, vs4, vs27
+ xvmaddadp vs61, vs5, vs27
+ xvmaddadp vs62, vs6, vs27
+ xvmaddadp vs63, vs7, vs27
+
+.endm
+
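+/* SAVE4x16 writes one 16x4 tile of C: with TRMMKERNEL undefined it loads C,
+ * computes C += alpha * AB with xvmaddadp and stores it back; with
+ * TRMMKERNEL defined it stores alpha * AB directly (xvmuldp). The same
+ * GEMM/TRMM split applies to every SAVE macro in this file. */
+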
+.macro SAVE4x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+ xvmaddadp vs4, vs52, alpha_r
+ xvmaddadp vs5, vs53, alpha_r
+ xvmaddadp vs6, vs54, alpha_r
+ xvmaddadp vs7, vs55, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+ xvmuldp vs4, vs52, alpha_r
+ xvmuldp vs5, vs53, alpha_r
+ xvmuldp vs6, vs54, alpha_r
+ xvmuldp vs7, vs55, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+ xvmaddadp vs12, vs60, alpha_r
+ xvmaddadp vs13, vs61, alpha_r
+ xvmaddadp vs14, vs62, alpha_r
+ xvmaddadp vs15, vs63, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+ xvmuldp vs12, vs60, alpha_r
+ xvmuldp vs13, vs61, alpha_r
+ xvmuldp vs14, vs62, alpha_r
+ xvmuldp vs15, vs63, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ dcbt T1, PRE
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=8 *
+*********************************************************************/
+
+.macro LOAD4x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_I1
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_1
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_2
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+ xvmaddadp vs50, vs10, vs30
+ xvmaddadp vs51, vs11, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+ xvmaddadp vs58, vs10, vs31
+ xvmaddadp vs59, vs11, vs31
+
+.endm
+
+.macro KERNEL4x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+ xvmuldp vs50, vs2, vs26
+ xvmuldp vs51, vs3, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+ xvmuldp vs58, vs2, vs27
+ xvmuldp vs59, vs3, vs27
+
+.endm
+
+.macro KERNEL4x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+ xvmaddadp vs50, vs2, vs26
+ xvmaddadp vs51, vs3, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+ xvmaddadp vs58, vs2, vs27
+ xvmaddadp vs59, vs3, vs27
+
+.endm
+
+.macro SAVE4x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+ xvmaddadp vs2, vs50, alpha_r
+ xvmaddadp vs3, vs51, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+ xvmuldp vs2, vs50, alpha_r
+ xvmuldp vs3, vs51, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+ xvmaddadp vs10, vs58, alpha_r
+ xvmaddadp vs11, vs59, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+ xvmuldp vs10, vs58, alpha_r
+ xvmuldp vs11, vs59, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=4 *
+*********************************************************************/
+
+.macro LOAD4x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+ xvmaddadp vs48, vs8, vs30
+ xvmaddadp vs49, vs9, vs30
+
+ xvmaddadp vs56, vs8, vs31
+ xvmaddadp vs57, vs9, vs31
+
+.endm
+
+.macro KERNEL4x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+ xvmuldp vs48, vs0, vs26
+ xvmuldp vs49, vs1, vs26
+
+ xvmuldp vs56, vs0, vs27
+ xvmuldp vs57, vs1, vs27
+
+.endm
+
+.macro KERNEL4x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+ xvmaddadp vs48, vs0, vs26
+ xvmaddadp vs49, vs1, vs26
+
+ xvmaddadp vs56, vs0, vs27
+ xvmaddadp vs57, vs1, vs27
+
+.endm
+
+.macro SAVE4x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+ xvmaddadp vs1, vs49, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+ xvmuldp vs1, vs49, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+ xvmaddadp vs9, vs57, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+ xvmuldp vs9, vs57, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=2 *
+*********************************************************************/
+
+.macro LOAD4x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+ lxvdsx vs30, o16, BO
+ lxvdsx vs31, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+ xvmaddadp vs48, vs8, vs30
+
+ xvmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+ xvmuldp vs48, vs0, vs26
+
+ xvmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+ lxvdsx vs26, o16, BO
+ lxvdsx vs27, o24, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 32
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+ xvmaddadp vs48, vs0, vs26
+
+ xvmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs48, alpha_r
+#else
+ xvmuldp vs0, vs48, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs56, alpha_r
+#else
+ xvmuldp vs8, vs56, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=4, M=1 *
+*********************************************************************/
+
+.macro LOAD4x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL4x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+ lxsdx vs30, o16, BO
+ lxsdx vs31, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+ xsmaddadp vs48, vs8, vs30
+
+ xsmaddadp vs56, vs8, vs31
+
+.endm
+
+.macro KERNEL4x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+ xsmuldp vs48, vs0, vs26
+
+ xsmuldp vs56, vs0, vs27
+
+.endm
+
+.macro KERNEL4x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+ lxsdx vs26, o16, BO
+ lxsdx vs27, o24, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 32
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+ xsmaddadp vs48, vs0, vs26
+
+ xsmaddadp vs56, vs0, vs27
+
+.endm
+
+.macro SAVE4x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs48, alpha_r
+#else
+ xsmuldp vs0, vs48, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs56, alpha_r
+#else
+ xsmuldp vs8, vs56, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=16 *
+*********************************************************************/
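+
+/* The N=2 macros follow the same scheme with only two broadcast B values
+ * (vs24/vs25 and vs28/vs29) and accumulators vs32-vs47 for the 16-wide
+ * tile; the narrower M tiles use correspondingly fewer registers. */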
+
+.macro LOAD2x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL2x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+ xvmaddadp vs44, vs12, vs29
+ xvmaddadp vs45, vs13, vs29
+ xvmaddadp vs46, vs14, vs29
+ xvmaddadp vs47, vs15, vs29
+
+.endm
+
+.macro KERNEL2x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+ xvmuldp vs44, vs4, vs25
+ xvmuldp vs45, vs5, vs25
+ xvmuldp vs46, vs6, vs25
+ xvmuldp vs47, vs7, vs25
+
+.endm
+
+.macro KERNEL2x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+ xvmaddadp vs44, vs4, vs25
+ xvmaddadp vs45, vs5, vs25
+ xvmaddadp vs46, vs6, vs25
+ xvmaddadp vs47, vs7, vs25
+
+.endm
+
+.macro SAVE2x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+
+ lxvd2x vs12, 0, T2
+ lxvd2x vs13, o16, T2
+ lxvd2x vs14, o32, T2
+ lxvd2x vs15, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+ xvmaddadp vs12, vs44, alpha_r
+ xvmaddadp vs13, vs45, alpha_r
+ xvmaddadp vs14, vs46, alpha_r
+ xvmaddadp vs15, vs47, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+ xvmuldp vs12, vs44, alpha_r
+ xvmuldp vs13, vs45, alpha_r
+ xvmuldp vs14, vs46, alpha_r
+ xvmuldp vs15, vs47, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ stxvd2x vs12, 0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=8                                               *
+*********************************************************************/
+
+.macro LOAD2x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+ xvmaddadp vs42, vs10, vs29
+ xvmaddadp vs43, vs11, vs29
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+ xvmuldp vs42, vs2, vs25
+ xvmuldp vs43, vs3, vs25
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+ xvmaddadp vs42, vs2, vs25
+ xvmaddadp vs43, vs3, vs25
+
+.endm
+
+.macro SAVE2x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+ lxvd2x vs10, o32, T1
+ lxvd2x vs11, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+ xvmaddadp vs10, vs42, alpha_r
+ xvmaddadp vs11, vs43, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+ xvmuldp vs10, vs42, alpha_r
+ xvmuldp vs11, vs43, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=4 *
+*********************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+ xvmaddadp vs40, vs8, vs29
+ xvmaddadp vs41, vs9, vs29
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+ xvmuldp vs40, vs0, vs25
+ xvmuldp vs41, vs1, vs25
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+ xvmaddadp vs40, vs0, vs25
+ xvmaddadp vs41, vs1, vs25
+
+.endm
+
+.macro SAVE2x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+ lxvd2x vs9, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+ xvmaddadp vs9, vs41, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+ xvmuldp vs9, vs41, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+ stxvd2x vs9, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=2 *
+*********************************************************************/
+
+.macro LOAD2x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+ lxvdsx vs29, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+ xvmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmuldp vs32, vs0, vs24
+
+ xvmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+ lxvdsx vs25, o8, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 16
+
+
+ xvmaddadp vs32, vs0, vs24
+
+ xvmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxvd2x vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs8, vs40, alpha_r
+#else
+ xvmuldp vs8, vs40, alpha_r
+#endif
+
+ stxvd2x vs8, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=2, M=1 *
+*********************************************************************/
+
+.macro LOAD2x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+ lxsdx vs29, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+ xsmaddadp vs40, vs8, vs29
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmuldp vs32, vs0, vs24
+
+ xsmuldp vs40, vs0, vs25
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+ lxsdx vs25, o8, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 16
+
+
+ xsmaddadp vs32, vs0, vs24
+
+ xsmaddadp vs40, vs0, vs25
+
+.endm
+
+.macro SAVE2x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+ lxsdx vs8, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs8, vs40, alpha_r
+#else
+ xsmuldp vs8, vs40, alpha_r
+#endif
+
+ stxsdx vs8, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=16 *
+*********************************************************************/
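+
+/* The N=1 macros broadcast a single B value (vs24 or vs28) and accumulate
+ * into vs32 and up (vs32-vs39 for the 16-wide tile). */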
+
+.macro LOAD1x16_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+.endm
+
+.macro KERNEL1x16_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs12, 0, AO
+ lxvd2x vs13, o16, AO
+ lxvd2x vs14, o32, AO
+ lxvd2x vs15, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+ xvmaddadp vs36, vs12, vs28
+ xvmaddadp vs37, vs13, vs28
+ xvmaddadp vs38, vs14, vs28
+ xvmaddadp vs39, vs15, vs28
+
+.endm
+
+.macro KERNEL1x16_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+ xvmuldp vs36, vs4, vs24
+ xvmuldp vs37, vs5, vs24
+ xvmuldp vs38, vs6, vs24
+ xvmuldp vs39, vs7, vs24
+
+.endm
+
+.macro KERNEL1x16_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+ lxvd2x vs4, 0, AO
+ lxvd2x vs5, o16, AO
+ lxvd2x vs6, o32, AO
+ lxvd2x vs7, o48, AO
+
+ addi AO, AO, 64
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+ xvmaddadp vs36, vs4, vs24
+ xvmaddadp vs37, vs5, vs24
+ xvmaddadp vs38, vs6, vs24
+ xvmaddadp vs39, vs7, vs24
+
+.endm
+
+.macro SAVE1x16
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+
+ lxvd2x vs4, 0, T2
+ lxvd2x vs5, o16, T2
+ lxvd2x vs6, o32, T2
+ lxvd2x vs7, o48, T2
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+ xvmaddadp vs4, vs36, alpha_r
+ xvmaddadp vs5, vs37, alpha_r
+ xvmaddadp vs6, vs38, alpha_r
+ xvmaddadp vs7, vs39, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+ xvmuldp vs4, vs36, alpha_r
+ xvmuldp vs5, vs37, alpha_r
+ xvmuldp vs6, vs38, alpha_r
+ xvmuldp vs7, vs39, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ stxvd2x vs4, 0, T2
+ stxvd2x vs5, o16, T2
+ stxvd2x vs6, o32, T2
+ stxvd2x vs7, o48, T2
+
+ addi CO, CO, 128
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=8                                               *
+*********************************************************************/
+
+.macro LOAD1x8_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+ lxvd2x vs10, o32, AO
+ lxvd2x vs11, o48, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+ xvmaddadp vs34, vs10, vs28
+ xvmaddadp vs35, vs11, vs28
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+ xvmuldp vs34, vs2, vs24
+ xvmuldp vs35, vs3, vs24
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+ lxvd2x vs2, o32, AO
+ lxvd2x vs3, o48, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 64
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+ xvmaddadp vs34, vs2, vs24
+ xvmaddadp vs35, vs3, vs24
+
+.endm
+
+.macro SAVE1x8
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+ lxvd2x vs2, o32, T1
+ lxvd2x vs3, o48, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+ xvmaddadp vs2, vs34, alpha_r
+ xvmaddadp vs3, vs35, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+ xvmuldp vs2, vs34, alpha_r
+ xvmuldp vs3, vs35, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+ stxvd2x vs2, o32, T1
+ stxvd2x vs3, o48, T1
+
+ addi CO, CO, 64
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=4 *
+*********************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, 0, AO
+ lxvd2x vs9, o16, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+ xvmaddadp vs33, vs9, vs28
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+ xvmuldp vs33, vs1, vs24
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, 0, AO
+ lxvd2x vs1, o16, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 32
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+ xvmaddadp vs33, vs1, vs24
+
+.endm
+
+.macro SAVE1x4
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+ lxvd2x vs1, o16, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+ xvmaddadp vs1, vs33, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+ xvmuldp vs1, vs33, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+ stxvd2x vs1, o16, T1
+
+ addi CO, CO, 32
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=2 *
+*********************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, 0, AO
+
+ lxvdsx vs28, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, 0, AO
+
+ lxvdsx vs24, 0, BO
+
+ addi AO, AO, 16
+ addi BO, BO, 8
+
+
+ xvmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x2
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxvd2x vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xvmaddadp vs0, vs32, alpha_r
+#else
+ xvmuldp vs0, vs32, alpha_r
+#endif
+
+ stxvd2x vs0, 0, T1
+
+ addi CO, CO, 16
+
+.endm
+
+/*********************************************************************
+* Macros for N=1, M=1 *
+*********************************************************************/
+
+.macro LOAD1x1_1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxsdx vs8, 0, AO
+
+ lxsdx vs28, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xsmaddadp vs32, vs8, vs28
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmuldp vs32, vs0, vs24
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxsdx vs0, 0, AO
+
+ lxsdx vs24, 0, BO
+
+ addi AO, AO, 8
+ addi BO, BO, 8
+
+
+ xsmaddadp vs32, vs0, vs24
+
+.endm
+
+.macro SAVE1x1
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+ lxsdx vs0, 0, T1
+#endif
+
+#ifndef TRMMKERNEL
+ xsmaddadp vs0, vs32, alpha_r
+#else
+ xsmuldp vs0, vs32, alpha_r
+#endif
+
+ stxsdx vs0, 0, T1
+
+ addi CO, CO, 8
+
+.endm
+
diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S
new file mode 100644
index 000000000..2294128a2
--- /dev/null
+++ b/kernel/power/dtrmm_kernel_16x4_power8.S
@@ -0,0 +1,362 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_SP 296(SP)
+#define FZERO 304(SP)
+#else
+#define STACKSIZE 240
+#define ALPHA_SP 224(SP)
+#define FZERO 232(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r8
+#define B r9
+#define C r10
+#define LDC r7
+#define OFFSET r6
+#else
+#define A r7
+#define B r8
+#define C r9
+#define LDC r10
+#define OFFSET r6
+#endif
+#endif
+
+#define alpha_r vs18
+
+#define o0 0
+
+#define K1 r13
+#define KKK r14
+#define o8 r15
+#define o24 r16
+#define ALPHA r17
+#define L r18
+#define T1 r19
+#define KK r20
+#define BB r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T2 r31
+
+#include "dgemm_macros_16x4_power8.S"
+
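+/* This TRMM kernel reuses the DGEMM 16x4 load/compute/save macros included
+ * above; the TRMM-specific behaviour (no load of C and xvmuldp instead of
+ * xvmaddadp in the SAVE macros) is selected via the TRMMKERNEL conditionals,
+ * and the per-tile KK/OFFSET bookkeeping lives in dtrmm_logic_16x4_power8.S,
+ * included below. */
+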
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ stfd f1, ALPHA_SP
+ stw r0, FZERO
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+ slwi LDC, LDC, BASE_SHIFT
+
+#if defined(TRMMKERNEL)
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#endif
+
+ mr KK, OFFSET
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, KK
+#endif
+
+ cmpwi cr0, M, 0
+ ble .L999_H1
+ cmpwi cr0, N, 0
+ ble .L999_H1
+ cmpwi cr0, K, 0
+ ble .L999_H1
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ li PRE, 256
+ li o8 , 8
+ li o16, 16
+ li o24, 24
+ li o32, 32
+ li o48, 48
+
+ lxvdsx alpha_r, 0, ALPHA
+
+#include "dtrmm_logic_16x4_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/dtrmm_logic_16x4_power8.S b/kernel/power/dtrmm_logic_16x4_power8.S
new file mode 100644
index 000000000..a4340c598
--- /dev/null
+++ b/kernel/power/dtrmm_logic_16x4_power8.S
@@ -0,0 +1,2239 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
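+/* Loop structure: J iterates over panels of 4 columns of B (N/4); within each
+ * panel, I iterates over tiles of 16 rows of A (M/16), followed by remainder
+ * tiles of 8, 4, 2 and 1 rows.  Panels of 2 and then 1 columns handle the
+ * remaining columns of B.  For the TRMM offset handling, KK tracks the current
+ * diagonal position: AO and BO are advanced by KK elements before each tile
+ * (depending on LEFT/TRANSA), and KK is incremented by the tile height (LEFT)
+ * or by the panel width (right-sided case) afterwards. */
+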
+
+ srawi. J, N, 2
+ ble .LDTRMM_L4_END
+
+.LDTRMM_L4_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 2
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L4x16_END
+
+.LDTRMM_L4x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x16_SUB4
+
+.LDTRMM_L4x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_I1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x16_LOOP
+
+.LDTRMM_L4x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+
+ dcbt AO, PRE
+ KERNEL4x16_1
+ dcbt AO, PRE
+ KERNEL4x16_2
+ dcbt AO, PRE
+ KERNEL4x16_1
+ KERNEL4x16_E2
+
+ b .LDTRMM_L4x16_SUB1
+
+.LDTRMM_L4x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL4x16_SUBI1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+ dcbt AO, PRE
+ KERNEL4x16_SUB1
+
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+ KERNEL4x16_SUB1
+
+ b .LDTRMM_L4x16_SUB1
+
+.LDTRMM_L4x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x16_SAVE
+ b .LDTRMM_L4x16_SUB2
+
+.LDTRMM_L4x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x16_SAVE
+
+.LDTRMM_L4x16_SUB2:
+
+ KERNEL4x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x16_SUB2
+
+.LDTRMM_L4x16_SAVE:
+
+ SAVE4x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L4x16_BEGIN
+
+.LDTRMM_L4x16_END:
+
+.LDTRMM_L4x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L4x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L4x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x8_SUB4
+
+.LDTRMM_L4x8_LOOP_START:
+
+ LOAD4x8_1
+ KERNEL4x8_I1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x8_LOOP:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x8_LOOP
+
+.LDTRMM_L4x8_LOOP_END:
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_2
+
+ KERNEL4x8_1
+ KERNEL4x8_2
+ KERNEL4x8_1
+ KERNEL4x8_E2
+
+ b .LDTRMM_L4x8_SUB1
+
+.LDTRMM_L4x8_SUB4:
+
+ KERNEL4x8_SUBI1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+ KERNEL4x8_SUB1
+
+ b .LDTRMM_L4x8_SUB1
+
+.LDTRMM_L4x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x8_SAVE
+ b .LDTRMM_L4x8_SUB2
+
+.LDTRMM_L4x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x8_SAVE
+
+.LDTRMM_L4x8_SUB2:
+
+ KERNEL4x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x8_SUB2
+
+.LDTRMM_L4x8_SAVE:
+
+ SAVE4x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x8_END:
+
+.LDTRMM_L4x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L4x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x4_SUB4
+
+.LDTRMM_L4x4_LOOP_START:
+
+ LOAD4x4_1
+ KERNEL4x4_I1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x4_LOOP:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x4_LOOP
+
+.LDTRMM_L4x4_LOOP_END:
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_2
+
+ KERNEL4x4_1
+ KERNEL4x4_2
+ KERNEL4x4_1
+ KERNEL4x4_E2
+
+ b .LDTRMM_L4x4_SUB1
+
+.LDTRMM_L4x4_SUB4:
+
+ KERNEL4x4_SUBI1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+ KERNEL4x4_SUB1
+
+ b .LDTRMM_L4x4_SUB1
+
+.LDTRMM_L4x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x4_SAVE
+ b .LDTRMM_L4x4_SUB2
+
+.LDTRMM_L4x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x4_SAVE
+
+.LDTRMM_L4x4_SUB2:
+
+ KERNEL4x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x4_SUB2
+
+.LDTRMM_L4x4_SAVE:
+
+ SAVE4x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x4_END:
+
+.LDTRMM_L4x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L4x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x2_SUB4
+
+.LDTRMM_L4x2_LOOP_START:
+
+ LOAD4x2_1
+ KERNEL4x2_I1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x2_LOOP:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x2_LOOP
+
+.LDTRMM_L4x2_LOOP_END:
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_2
+
+ KERNEL4x2_1
+ KERNEL4x2_2
+ KERNEL4x2_1
+ KERNEL4x2_E2
+
+ b .LDTRMM_L4x2_SUB1
+
+.LDTRMM_L4x2_SUB4:
+
+ KERNEL4x2_SUBI1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+ KERNEL4x2_SUB1
+
+ b .LDTRMM_L4x2_SUB1
+
+.LDTRMM_L4x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x2_SAVE
+ b .LDTRMM_L4x2_SUB2
+
+.LDTRMM_L4x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x2_SAVE
+
+.LDTRMM_L4x2_SUB2:
+
+ KERNEL4x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x2_SUB2
+
+.LDTRMM_L4x2_SAVE:
+
+ SAVE4x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x2_END:
+
+.LDTRMM_L4x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L4x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L4x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L4x1_SUB4
+
+.LDTRMM_L4x1_LOOP_START:
+
+ LOAD4x1_1
+ KERNEL4x1_I1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L4x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L4x1_LOOP:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x1_LOOP
+
+.LDTRMM_L4x1_LOOP_END:
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_2
+
+ KERNEL4x1_1
+ KERNEL4x1_2
+ KERNEL4x1_1
+ KERNEL4x1_E2
+
+ b .LDTRMM_L4x1_SUB1
+
+.LDTRMM_L4x1_SUB4:
+
+ KERNEL4x1_SUBI1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+ KERNEL4x1_SUB1
+
+ b .LDTRMM_L4x1_SUB1
+
+.LDTRMM_L4x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL4x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L4x1_SAVE
+ b .LDTRMM_L4x1_SUB2
+
+.LDTRMM_L4x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L4x1_SAVE
+
+.LDTRMM_L4x1_SUB2:
+
+ KERNEL4x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L4x1_SUB2
+
+.LDTRMM_L4x1_SAVE:
+
+ SAVE4x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L4x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt .LDTRMM_L4_BEGIN
+
+ andi. T2, N, 3
+ ble .L999
+
+.LDTRMM_L4_END:
+
+ b .LDTRMM_L2_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LDTRMM_L2_BEGIN:
+
+ andi. T1, N, 2
+ ble .LDTRMM_L2_END
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L2x16_END
+
+.LDTRMM_L2x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x16_SUB4
+
+.LDTRMM_L2x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_I1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x16_LOOP
+
+.LDTRMM_L2x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+
+ dcbt AO, PRE
+ KERNEL2x16_1
+ dcbt AO, PRE
+ KERNEL2x16_2
+ dcbt AO, PRE
+ KERNEL2x16_1
+ KERNEL2x16_E2
+
+ b .LDTRMM_L2x16_SUB1
+
+.LDTRMM_L2x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x16_SUBI1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+ dcbt AO, PRE
+ KERNEL2x16_SUB1
+
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+ KERNEL2x16_SUB1
+
+ b .LDTRMM_L2x16_SUB1
+
+.LDTRMM_L2x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x16_SAVE
+ b .LDTRMM_L2x16_SUB2
+
+.LDTRMM_L2x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x16_SAVE
+
+.LDTRMM_L2x16_SUB2:
+
+ KERNEL2x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x16_SUB2
+
+.LDTRMM_L2x16_SAVE:
+
+ SAVE2x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L2x16_BEGIN
+
+.LDTRMM_L2x16_END:
+
+.LDTRMM_L2x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L2x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L2x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x8_SUB4
+
+.LDTRMM_L2x8_LOOP_START:
+
+ LOAD2x8_1
+ KERNEL2x8_I1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x8_LOOP:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x8_LOOP
+
+.LDTRMM_L2x8_LOOP_END:
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_2
+
+ KERNEL2x8_1
+ KERNEL2x8_2
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LDTRMM_L2x8_SUB1
+
+.LDTRMM_L2x8_SUB4:
+
+ KERNEL2x8_SUBI1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LDTRMM_L2x8_SUB1
+
+.LDTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x8_SAVE
+ b .LDTRMM_L2x8_SUB2
+
+.LDTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x8_SAVE
+
+.LDTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x8_SUB2
+
+.LDTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x8_END:
+
+.LDTRMM_L2x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x4_SUB4
+
+.LDTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x4_LOOP
+
+.LDTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LDTRMM_L2x4_SUB1
+
+.LDTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LDTRMM_L2x4_SUB1
+
+.LDTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x4_SAVE
+ b .LDTRMM_L2x4_SUB2
+
+.LDTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x4_SAVE
+
+.LDTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x4_SUB2
+
+.LDTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x4_END:
+
+.LDTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x2_SUB4
+
+.LDTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x2_LOOP
+
+.LDTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LDTRMM_L2x2_SUB1
+
+.LDTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LDTRMM_L2x2_SUB1
+
+.LDTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x2_SAVE
+ b .LDTRMM_L2x2_SUB2
+
+.LDTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x2_SAVE
+
+.LDTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x2_SUB2
+
+.LDTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x2_END:
+
+.LDTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L2x1_SUB4
+
+.LDTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L2x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x1_LOOP
+
+.LDTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LDTRMM_L2x1_SUB1
+
+.LDTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LDTRMM_L2x1_SUB1
+
+.LDTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L2x1_SAVE
+ b .LDTRMM_L2x1_SUB2
+
+.LDTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L2x1_SAVE
+
+.LDTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L2x1_SUB2
+
+.LDTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L2x1_END:
+
+ slwi T1, K, 4
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+.LDTRMM_L2_END:
+.LDTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LDTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 4
+ ble .LDTRMM_L1x16_END
+
+.LDTRMM_L1x16_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x16_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x16_SUB4
+
+.LDTRMM_L1x16_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_I1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x16_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x16_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x16_LOOP
+
+.LDTRMM_L1x16_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+
+ dcbt AO, PRE
+ KERNEL1x16_1
+ dcbt AO, PRE
+ KERNEL1x16_2
+ dcbt AO, PRE
+ KERNEL1x16_1
+ KERNEL1x16_E2
+
+ b .LDTRMM_L1x16_SUB1
+
+.LDTRMM_L1x16_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x16_SUBI1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+ dcbt AO, PRE
+ KERNEL1x16_SUB1
+
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+ KERNEL1x16_SUB1
+
+ b .LDTRMM_L1x16_SUB1
+
+.LDTRMM_L1x16_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x16_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x16_SAVE
+ b .LDTRMM_L1x16_SUB2
+
+.LDTRMM_L1x16_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x16_SAVE
+
+.LDTRMM_L1x16_SUB2:
+
+ KERNEL1x16_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x16_SUB2
+
+.LDTRMM_L1x16_SAVE:
+
+ SAVE1x16
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 16 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LDTRMM_L1x16_BEGIN
+
+.LDTRMM_L1x16_END:
+
+.LDTRMM_L1x8_BEGIN:
+ andi. T2, M, 15
+ ble .LDTRMM_L1x1_END
+
+ andi. T1, M, 8
+ ble .LDTRMM_L1x8_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x8_SUB4
+
+.LDTRMM_L1x8_LOOP_START:
+
+ LOAD1x8_1
+ KERNEL1x8_I1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x8_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x8_LOOP:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x8_LOOP
+
+.LDTRMM_L1x8_LOOP_END:
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_2
+
+ KERNEL1x8_1
+ KERNEL1x8_2
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LDTRMM_L1x8_SUB1
+
+.LDTRMM_L1x8_SUB4:
+
+ KERNEL1x8_SUBI1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LDTRMM_L1x8_SUB1
+
+.LDTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x8_SAVE
+ b .LDTRMM_L1x8_SUB2
+
+.LDTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x8_SAVE
+
+.LDTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x8_SUB2
+
+.LDTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x8_END:
+
+.LDTRMM_L1x4_BEGIN:
+
+ andi. T1, M, 4
+ ble .LDTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x4_SUB4
+
+.LDTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x4_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x4_LOOP
+
+.LDTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LDTRMM_L1x4_SUB1
+
+.LDTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LDTRMM_L1x4_SUB1
+
+.LDTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x4_SAVE
+ b .LDTRMM_L1x4_SUB2
+
+.LDTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x4_SAVE
+
+.LDTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x4_SUB2
+
+.LDTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x4_END:
+
+.LDTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LDTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x2_SUB4
+
+.LDTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x2_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x2_LOOP
+
+.LDTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LDTRMM_L1x2_SUB1
+
+.LDTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LDTRMM_L1x2_SUB1
+
+.LDTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x2_SAVE
+ b .LDTRMM_L1x2_SUB2
+
+.LDTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x2_SAVE
+
+.LDTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x2_SUB2
+
+.LDTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x2_END:
+
+.LDTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LDTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 3 // Number of values in B shifted
+ slwi T2, KK, 3 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LDTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LDTRMM_L1x1_SUB4
+
+.LDTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LDTRMM_L1x1_LOOP_END
+
+ .align 5
+
+.LDTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x1_LOOP
+
+.LDTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LDTRMM_L1x1_SUB1
+
+.LDTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LDTRMM_L1x1_SUB1
+
+.LDTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LDTRMM_L1x1_SAVE
+ b .LDTRMM_L1x1_SUB2
+
+.LDTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LDTRMM_L1x1_SAVE
+
+.LDTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LDTRMM_L1x1_SUB2
+
+.LDTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LDTRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+.LDTRMM_L1_END:
diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S
index a4dcc49c1..c6e69b4fc 100644
--- a/kernel/power/gemm_ncopy_4.S
+++ b/kernel/power/gemm_ncopy_4.S
@@ -107,6 +107,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
+#endif
+
+#ifdef POWER8
+#define PREFETCHSIZE 16
+#define PREFETCHWSIZE 72
#endif
PROLOGUE
@@ -193,7 +198,7 @@ LL(12):
STFD c12, 14 * SIZE(B)
STFD c16, 15 * SIZE(B)
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3
diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S
index 1b6af4801..30513447e 100644
--- a/kernel/power/gemm_tcopy_4.S
+++ b/kernel/power/gemm_tcopy_4.S
@@ -111,6 +111,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
+#endif
+
+#ifdef POWER8
+#define PREFETCHSIZE 16
+#define PREFETCHWSIZE 48
#endif
PROLOGUE
@@ -224,7 +229,7 @@ LL(12):
STFD c15, 14 * SIZE(B1)
STFD c16, 15 * SIZE(B1)
-#ifdef POWER6
+#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3
diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S
index 77587ecb1..02160bd61 100644
--- a/kernel/power/gemv_n.S
+++ b/kernel/power/gemv_n.S
@@ -174,6 +174,12 @@
#define PREFETCHSIZE_C 40
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 96
+#define PREFETCHSIZE_C 40
+#endif
+
+
#ifndef NEEDPARAM
#ifndef __64BIT__
diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S
index 817a60b86..457753065 100644
--- a/kernel/power/gemv_t.S
+++ b/kernel/power/gemv_t.S
@@ -139,6 +139,11 @@
#define PREFETCHSIZE_C 8
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 96
+#define PREFETCHSIZE_C 8
+#endif
+
#define y01 f0
#define y02 f1
#define y03 f2
diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S
new file mode 100644
index 000000000..a7665f749
--- /dev/null
+++ b/kernel/power/zgemm_kernel_8x2_power8.S
@@ -0,0 +1,367 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define L r15
+#define ALPHA r16
+#define o24 r17
+#define T2 r19
+#define KK r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
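+// PRE holds the software-prefetch distance (set to 256 below and used as the
+// index operand of dcbt in the loop code); o8..o48 hold small constant offsets
+// used as index registers for the VSX loads and stores.
+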
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+#endif
+
+ stfd f1, ALPHA_R_SP
+ stfd f2, ALPHA_I_SP
+ stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz C, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
+
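+// The LOAD*/KERNEL*/SAVE* compute macros are defined in zgemm_macros_8x2_power8.S;
+// the loop nest that drives them is pulled in below from zgemm_logic_8x2_power8.S.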
+#include "zgemm_macros_8x2_power8.S"
+
+ cmpwi cr0, M, 0
+ ble .L999
+ cmpwi cr0, N, 0
+ ble .L999
+ cmpwi cr0, K, 0
+ ble .L999
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 256
+ li o8 , 8
+ li o16 , 16
+ li o24 , 24
+ li o32 , 32
+ li o48 , 48
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ lxvdsx alpha_r, 0, ALPHA
+ lxvdsx alpha_i, o8, ALPHA
+
+ .align 5
+
+#include "zgemm_logic_8x2_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S
new file mode 100644
index 000000000..5fcade5bf
--- /dev/null
+++ b/kernel/power/zgemm_logic_8x2_power8.S
@@ -0,0 +1,901 @@
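+// Loop nest: J = N >> 1 walks pairs of B columns (the odd column, if any, is
+// handled by the .LZGEMM_L1_* code further down). Within each column pair,
+// I = M >> 3 walks 8-row panels of A, with 4-, 2- and 1-row tail cases.
+// The K loop is unrolled by 8 (L = K >> 3); the K & 7 remainder iterations go
+// through the *_SUB2 paths.
+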
+ srawi. J, N, 1
+ ble .LZGEMM_L2_END
+
+.LZGEMM_L2_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+ srawi. I, M, 3
+ ble .LZGEMM_L2x8_END
+
+.LZGEMM_L2x8_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x8_SUB4
+
+.LZGEMM_L2x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_I1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x8_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x8_LOOP
+
+.LZGEMM_L2x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LZGEMM_L2x8_SUB1
+
+.LZGEMM_L2x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x8_SUBI1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LZGEMM_L2x8_SUB1
+
+.LZGEMM_L2x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x8_SAVE
+ b .LZGEMM_L2x8_SUB2
+
+.LZGEMM_L2x8_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x8_SAVE
+
+.LZGEMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x8_SUB2
+
+.LZGEMM_L2x8_SAVE:
+
+ SAVE2x8
+
+ addic. I, I, -1
+ bgt .LZGEMM_L2x8_BEGIN
+
+.LZGEMM_L2x8_END:
+
+.LZGEMM_L2x4_BEGIN:
+
+ andi. T2, M, 7
+ ble .LZGEMM_L2x1_END
+
+ andi. T1, M, 4
+ ble .LZGEMM_L2x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x4_SUB4
+
+.LZGEMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x4_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x4_LOOP
+
+.LZGEMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LZGEMM_L2x4_SUB1
+
+.LZGEMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LZGEMM_L2x4_SUB1
+
+.LZGEMM_L2x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x4_SAVE
+ b .LZGEMM_L2x4_SUB2
+
+.LZGEMM_L2x4_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x4_SAVE
+
+.LZGEMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x4_SUB2
+
+.LZGEMM_L2x4_SAVE:
+
+ SAVE2x4
+
+.LZGEMM_L2x4_END:
+
+.LZGEMM_L2x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LZGEMM_L2x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x2_SUB4
+
+.LZGEMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x2_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x2_LOOP
+
+.LZGEMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LZGEMM_L2x2_SUB1
+
+.LZGEMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LZGEMM_L2x2_SUB1
+
+.LZGEMM_L2x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x2_SAVE
+ b .LZGEMM_L2x2_SUB2
+
+.LZGEMM_L2x2_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x2_SAVE
+
+.LZGEMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x2_SUB2
+
+.LZGEMM_L2x2_SAVE:
+
+ SAVE2x2
+
+.LZGEMM_L2x2_END:
+
+.LZGEMM_L2x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LZGEMM_L2x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L2x1_SUB4
+
+.LZGEMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L2x1_LOOP_END
+
+ .align 5
+
+.LZGEMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x1_LOOP
+
+.LZGEMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LZGEMM_L2x1_SUB1
+
+.LZGEMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LZGEMM_L2x1_SUB1
+
+.LZGEMM_L2x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L2x1_SAVE
+ b .LZGEMM_L2x1_SUB2
+
+.LZGEMM_L2x1_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L2x1_SAVE
+
+.LZGEMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L2x1_SUB2
+
+.LZGEMM_L2x1_SAVE:
+
+ SAVE2x1
+
+.LZGEMM_L2x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+ addic. J, J, -1
+ bgt .LZGEMM_L2_BEGIN
+
+ andi. T2, N, 1
+ ble .L999
+
+.LZGEMM_L2_END:
+
+ b .LZGEMM_L1_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LZGEMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LZGEMM_L1_END
+ mr CO, C
+ mr AO, A
+ srawi. I, M, 3
+ ble .LZGEMM_L1x8_END
+
+.LZGEMM_L1x8_BEGIN:
+
+
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x8_SUB4
+
+.LZGEMM_L1x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_I1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x8_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x8_LOOP
+
+.LZGEMM_L1x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LZGEMM_L1x8_SUB1
+
+.LZGEMM_L1x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x8_SUBI1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LZGEMM_L1x8_SUB1
+
+.LZGEMM_L1x8_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x8_SAVE
+ b .LZGEMM_L1x8_SUB2
+
+.LZGEMM_L1x8_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x8_SAVE
+
+.LZGEMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x8_SUB2
+
+.LZGEMM_L1x8_SAVE:
+
+ SAVE1x8
+
+ addic. I, I, -1
+ bgt .LZGEMM_L1x8_BEGIN
+
+.LZGEMM_L1x8_END:
+
+.LZGEMM_L1x4_BEGIN:
+
+ andi. T2, M, 7
+ ble .LZGEMM_L1x1_END
+
+ andi. T1, M, 4
+ ble .LZGEMM_L1x4_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x4_SUB4
+
+.LZGEMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x4_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x4_LOOP
+
+.LZGEMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LZGEMM_L1x4_SUB1
+
+.LZGEMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LZGEMM_L1x4_SUB1
+
+.LZGEMM_L1x4_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x4_SAVE
+ b .LZGEMM_L1x4_SUB2
+
+.LZGEMM_L1x4_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x4_SAVE
+
+.LZGEMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x4_SUB2
+
+.LZGEMM_L1x4_SAVE:
+
+ SAVE1x4
+
+.LZGEMM_L1x4_END:
+
+.LZGEMM_L1x2_BEGIN:
+
+
+ andi. T1, M, 2
+ ble .LZGEMM_L1x2_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x2_SUB4
+
+.LZGEMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x2_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x2_LOOP
+
+.LZGEMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LZGEMM_L1x2_SUB1
+
+.LZGEMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LZGEMM_L1x2_SUB1
+
+.LZGEMM_L1x2_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x2_SAVE
+ b .LZGEMM_L1x2_SUB2
+
+.LZGEMM_L1x2_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x2_SAVE
+
+.LZGEMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x2_SUB2
+
+.LZGEMM_L1x2_SAVE:
+
+ SAVE1x2
+
+.LZGEMM_L1x2_END:
+
+.LZGEMM_L1x1_BEGIN:
+
+
+ andi. T1, M, 1
+ ble .LZGEMM_L1x1_END
+ mr BO, B
+ srawi. L, K, 3
+ ble .LZGEMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZGEMM_L1x1_SUB4
+
+.LZGEMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LZGEMM_L1x1_LOOP_END
+
+ .align 5
+
+.LZGEMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x1_LOOP
+
+.LZGEMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LZGEMM_L1x1_SUB1
+
+.LZGEMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LZGEMM_L1x1_SUB1
+
+.LZGEMM_L1x1_SUB0:
+
+ andi. L, K, 7
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZGEMM_L1x1_SAVE
+ b .LZGEMM_L1x1_SUB2
+
+.LZGEMM_L1x1_SUB1:
+
+ andi. L, K, 7
+ ble .LZGEMM_L1x1_SAVE
+
+.LZGEMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZGEMM_L1x1_SUB2
+
+.LZGEMM_L1x1_SAVE:
+
+ SAVE1x1
+
+.LZGEMM_L1x1_END:
+
+.LZGEMM_L1_END:
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
new file mode 100644
index 000000000..701ec65c8
--- /dev/null
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -0,0 +1,3110 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xsadddp
+
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xsadddp
+ #define XSFADD_I1 xsadddp
+ #define XSFADD_I2 xssubdp
+
+#else // CC || CR || RC || RR
+
+ #define XSFADD_R1 xsadddp
+ #define XSFADD_R2 xssubdp
+ #define XSFADD_I1 xssubdp
+ #define XSFADD_I2 xssubdp
+
+#endif
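+
+// The XSFADD_* selections above set the signs used when the SAVE* macros fold
+// the partial products together: XSFADD_R1/XSFADD_R2 combine realA*realB and
+// imagA*imagB into the real part of the result, and XSFADD_I1/XSFADD_I2 combine
+// realA*imagB and imagA*realB into the imaginary part, with the add/sub choice
+// depending on which of A and B is conjugated.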
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
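+
+// Register usage in the 2x8 macros: the A panel is double-buffered in
+// vs0-vs7 / vs8-vs15, the splatted B values in vs16-vs19 / vs20-vs23, and the
+// 32 accumulators live in vs32-vs63. KERNEL2x8_1 computes on one buffer while
+// loading the other, so a _1/_2 pair covers two iterations of K.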
+
+.macro LOAD2x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_1
+
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ addi AO, AO, 64
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+ addi AO, AO, 64
+ addi BO, BO, 32
+
+.endm
+
+.macro KERNEL2x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs50, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs52, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs54, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
+ xvmaddadp vs56, vs12, vs22 // real*real, imag*real
+ xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
+ xvmaddadp vs58, vs13, vs22 // real*real, imag*real
+ xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
+ xvmaddadp vs60, vs14, vs22 // real*real, imag*real
+ xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
+ xvmaddadp vs62, vs15, vs22 // real*real, imag*real
+ xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmuldp vs48, vs0, vs18 // real*real, imag*real
+ xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs50, vs1, vs18 // real*real, imag*real
+ xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs52, vs2, vs18 // real*real, imag*real
+ xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs54, vs3, vs18 // real*real, imag*real
+ xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmuldp vs56, vs4, vs18 // real*real, imag*real
+ xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmuldp vs58, vs5, vs18 // real*real, imag*real
+ xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmuldp vs60, vs6, vs18 // real*real, imag*real
+ xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmuldp vs62, vs7, vs18 // real*real, imag*real
+ xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs48, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs50, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs52, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs54, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
+ xvmaddadp vs56, vs4, vs18 // real*real, imag*real
+ xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
+ xvmaddadp vs58, vs5, vs18 // real*real, imag*real
+ xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
+ xvmaddadp vs60, vs6, vs18 // real*real, imag*real
+ xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
+ xvmaddadp vs62, vs7, vs18 // real*real, imag*real
+ xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
+
+
+.endm
+
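+// SAVE2x8: for each of the 8 rows and 2 columns, reduce the accumulator pair
+// (e.g. vs32/vs33) into one complex value via xxswapd and XSFADD_*, scale by
+// alpha_r/alpha_i, add the existing C values unless TRMMKERNEL is defined,
+// store 8 complex doubles per B column, and finally advance CO by 128 bytes.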
+.macro SAVE2x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs48 // realA*realB
+ XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
+
+ xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs48 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs49 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs50 // realA*realB
+ XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
+
+ xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs50 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs51 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs52 // realA*realB
+ XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
+
+ xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs52 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs53 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs54 // realA*realB
+ XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
+
+ xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs54 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs55 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs56 // realA*realB
+ XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
+
+ xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs56 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs57 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs58 // realA*realB
+ XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
+
+ xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs58 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs59 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs60 // realA*realB
+ XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
+
+ xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs60 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs61 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs62 // realA*realB
+ XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
+
+ xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs62 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs63 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+.macro LOAD2x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL2x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs42, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
+ xvmaddadp vs44, vs10, vs22 // real*real, imag*real
+ xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
+ xvmaddadp vs46, vs11, vs22 // real*real, imag*real
+ xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmuldp vs40, vs0, vs18 // real*real, imag*real
+ xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs42, vs1, vs18 // real*real, imag*real
+ xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmuldp vs44, vs2, vs18 // real*real, imag*real
+ xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmuldp vs46, vs3, vs18 // real*real, imag*real
+ xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs40, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs42, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
+ xvmaddadp vs44, vs2, vs18 // real*real, imag*real
+ xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
+ xvmaddadp vs46, vs3, vs18 // real*real, imag*real
+ xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
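+
+// Illustrative sketch (plain C, assuming the non-conjugated case; the XSFADD_*
+// macros carry the sign flips for the conjugate variants): for every output
+// element the even/odd accumulator pair holds the four partial sums
+//     rr = sum_k a_re*b_re;   ir = sum_k a_im*b_re;
+//     ri = sum_k a_re*b_im;   ii = sum_k a_im*b_im;
+// and the SAVE macros reduce and scale them roughly like
+//     double re   = rr - ii;                        // XSFADD_R1 / XSFADD_R2
+//     double im   = ri + ir;                        // XSFADD_I1 / XSFADD_I2
+//     double c_re = re * alpha_r - im * alpha_i;    // xsmuldp, xssubdp
+//     double c_im = re * alpha_i + im * alpha_r;    // xsmuldp, xsadddp
+//     C[j] = (trmm ? 0 : C[j]) + c_re + I * c_im;   // xvadddp + stxvd2x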
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=2
+**********************************************************************************************/
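+
+// Naming convention used throughout these kernels: _I1 starts the accumulators
+// with xvmuldp while preloading the next A/B block into the vs8../vs20.. set,
+// _1 and _2 are the two halves of the software pipeline (each computes on one
+// register set while loading the other), _E2 drains the pipeline without
+// further loads, and _SUBI1/_SUB1 handle the unpipelined remainder iterations.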
+
+.macro LOAD2x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL2x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
+ xvmaddadp vs38, vs9, vs22 // real*real, imag*real
+ xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmuldp vs36, vs0, vs18 // real*real, imag*real
+ xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmuldp vs38, vs1, vs18 // real*real, imag*real
+ xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs36, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
+ xvmaddadp vs38, vs1, vs18 // real*real, imag*real
+ xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=1
+**********************************************************************************************/
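+
+// For M=1 a single complex element of A (vs0 or vs8) is multiplied against both
+// splatted B columns, giving the four accumulators vs32-vs35; SAVE2x1 then
+// writes one element per C column, stepping the pointer by LDC in between.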
+
+.macro LOAD2x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL2x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+ lxvdsx vs22, o16, BO // load real part from B
+ lxvdsx vs23, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs8, vs22 // real*real, imag*real
+ xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmuldp vs34, vs0, vs18 // real*real, imag*real
+ xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL2x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+ lxvdsx vs18, o16, BO // load real part from B
+ lxvdsx vs19, o24, BO // load imag part from B
+
+ addi BO, BO, 32
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+ xvmaddadp vs34, vs0, vs18 // real*real, imag*real
+ xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE2x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
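+
+// With N=1 only one (real,imag) pair of B is splatted (vs16/vs17, or vs20/vs21
+// in the pipelined variants), so each KERNEL1x8 step issues 16 FMAs against the
+// eight complex A elements held in vs0-vs7 or vs8-vs15.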
+
+.macro LOAD1x8_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x8_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs12, o0, AO // load real,imag from A
+ lxvd2x vs13, o16, AO // load real,imag from A
+ lxvd2x vs14, o32, AO // load real,imag from A
+ lxvd2x vs15, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+ xvmaddadp vs40, vs12, vs20 // real*real, imag*real
+ xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
+ xvmaddadp vs42, vs13, vs20 // real*real, imag*real
+ xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
+ xvmaddadp vs44, vs14, vs20 // real*real, imag*real
+ xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
+ xvmaddadp vs46, vs15, vs20 // real*real, imag*real
+ xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmuldp vs40, vs4, vs16 // real*real, imag*real
+ xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmuldp vs42, vs5, vs16 // real*real, imag*real
+ xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmuldp vs44, vs6, vs16 // real*real, imag*real
+ xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmuldp vs46, vs7, vs16 // real*real, imag*real
+ xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x8_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvd2x vs4, o0, AO // load real,imag from A
+ lxvd2x vs5, o16, AO // load real,imag from A
+ lxvd2x vs6, o32, AO // load real,imag from A
+ lxvd2x vs7, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+ xvmaddadp vs40, vs4, vs16 // real*real, imag*real
+ xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
+ xvmaddadp vs42, vs5, vs16 // real*real, imag*real
+ xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
+ xvmaddadp vs44, vs6, vs16 // real*real, imag*real
+ xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
+ xvmaddadp vs46, vs7, vs16 // real*real, imag*real
+ xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x8
+
+
+ mr T1, CO
+ addi T2, T1, 64
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+ lxvd2x vs20, o0, T2
+ lxvd2x vs21, o16, T2
+ lxvd2x vs22, o32, T2
+ lxvd2x vs23, o48, T2
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs40 // realA*realB
+ XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
+
+ xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs40 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs41 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs42 // realA*realB
+ XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
+
+ xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs42 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs43 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs44 // realA*realB
+ XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
+
+ xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs44 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs45 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs46 // realA*realB
+ XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
+
+ xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs46 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs47 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+ xvadddp vs12, vs12, vs20
+ xvadddp vs13, vs13, vs21
+ xvadddp vs14, vs14, vs22
+ xvadddp vs15, vs15, vs23
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+ stxvd2x vs12, o0, T2
+ stxvd2x vs13, o16, T2
+ stxvd2x vs14, o32, T2
+ stxvd2x vs15, o48, T2
+
+ add T1, T1, LDC
+ add T2, T2, LDC
+ addi CO, CO, 128
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+.macro LOAD1x4_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+
+.endm
+
+.macro KERNEL1x4_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+ lxvd2x vs10, o32, AO // load real,imag from A
+ lxvd2x vs11, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+ xvmaddadp vs36, vs10, vs20 // real*real, imag*real
+ xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
+ xvmaddadp vs38, vs11, vs20 // real*real, imag*real
+ xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmuldp vs36, vs2, vs16 // real*real, imag*real
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmuldp vs38, vs3, vs16 // real*real, imag*real
+ xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x4_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+ lxvd2x vs2, o32, AO // load real,imag from A
+ lxvd2x vs3, o48, AO // load real,imag from A
+
+ addi AO, AO, 64
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+ xvmaddadp vs36, vs2, vs16 // real*real, imag*real
+ xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
+ xvmaddadp vs38, vs3, vs16 // real*real, imag*real
+ xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x4
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+ lxvd2x vs18, o32, T1
+ lxvd2x vs19, o48, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs36 // realA*realB
+ XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
+
+ xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs36 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs37 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs38 // realA*realB
+ XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
+
+ xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs38 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs39 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+ xvadddp vs10, vs10, vs18
+ xvadddp vs11, vs11, vs19
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+ stxvd2x vs10, o32, T1
+ stxvd2x vs11, o48, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 64
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=2
+**********************************************************************************************/
+
+.macro LOAD1x2_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+
+.endm
+
+.macro KERNEL1x2_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+ lxvd2x vs9, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+ xvmaddadp vs34, vs9, vs20 // real*real, imag*real
+ xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmuldp vs34, vs1, vs16 // real*real, imag*real
+ xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x2_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+ lxvd2x vs1, o16, AO // load real,imag from A
+
+ addi AO, AO, 32
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+ xvmaddadp vs34, vs1, vs16 // real*real, imag*real
+ xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x2
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+ lxvd2x vs17, o16, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs34 // realA*realB
+ XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
+
+ xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs34 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs35 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+ xvadddp vs9, vs9, vs17
+
+#endif
+
+ stxvd2x vs8, o0, T1
+ stxvd2x vs9, o16, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 32
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=1
+**********************************************************************************************/
+
+.macro LOAD1x1_1
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+
+.endm
+
+.macro KERNEL1x1_I1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_1
+
+ lxvd2x vs8, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs20, o0, BO // load real part from B
+ lxvdsx vs21, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_2
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_E2
+
+
+ xvmaddadp vs32, vs8, vs20 // real*real, imag*real
+ xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUBI1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmuldp vs32, vs0, vs16 // real*real, imag*real
+ xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro KERNEL1x1_SUB1
+
+ lxvd2x vs0, o0, AO // load real,imag from A
+
+ addi AO, AO, 16
+
+ lxvdsx vs16, o0, BO // load real part from B
+ lxvdsx vs17, o8, BO // load imag part from B
+
+ addi BO, BO, 16
+
+ xvmaddadp vs32, vs0, vs16 // real*real, imag*real
+ xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
+
+
+.endm
+
+.macro SAVE1x1
+
+
+ mr T1, CO
+
+#ifndef TRMMKERNEL
+
+ lxvd2x vs16, o0, T1
+
+#endif
+
+
+ xxlxor vs0, vs0, vs0
+ xxlxor vs1, vs1, vs1
+ xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+
+ XSFADD_R1 vs0, vs0, vs32 // realA*realB
+ XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
+
+ xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB
+ xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+
+ XSFADD_I1 vs1, vs1, vs32 // realA*imagB
+ XSFADD_I2 vs1, vs1, vs33 // imagA*realB
+
+ xsmuldp vs4, vs0, alpha_r // real*alpha_r
+ xsmuldp vs5, vs1, alpha_i // imag*alpha_i
+ xsmuldp vs6, vs0, alpha_i // real*alpha_i
+ xsmuldp vs7, vs1, alpha_r // imag*alpha_r
+
+ xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
+ xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
+ xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
+
+
+#ifndef TRMMKERNEL
+
+ xvadddp vs8, vs8, vs16
+
+#endif
+
+ stxvd2x vs8, o0, T1
+
+ add T1, T1, LDC
+ addi CO, CO, 16
+
+.endm
+
diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S
index 23e0177c0..f93439986 100644
--- a/kernel/power/zgemv_n.S
+++ b/kernel/power/zgemv_n.S
@@ -170,6 +170,11 @@
#define PREFETCHSIZE_C 24
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 24
+#define PREFETCHSIZE_C 24
+#endif
+
#ifndef XCONJ
#define FMADDR FMADD
#define FMSUBR FNMSUB
diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S
index c0bad3152..9c6f510c2 100644
--- a/kernel/power/zgemv_t.S
+++ b/kernel/power/zgemv_t.S
@@ -144,6 +144,12 @@
#define PREFETCHSIZE_C 8
#endif
+#ifdef POWER8
+#define PREFETCHSIZE_A 24
+#define PREFETCHSIZE_C 8
+#endif
+
+
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB
diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S
new file mode 100644
index 000000000..8b953765e
--- /dev/null
+++ b/kernel/power/ztrmm_kernel_8x2_power8.S
@@ -0,0 +1,377 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "def_vsx.h"
+
+#ifndef __64BIT__
+#define LOAD lwz
+#else
+#define LOAD ld
+#endif
+
+#ifdef __64BIT__
+#define STACKSIZE 320
+#define ALPHA_R_SP 296(SP)
+#define ALPHA_I_SP 304(SP)
+#define FZERO 312(SP)
+#else
+#define STACKSIZE 256
+#define ALPHA_R_SP 224(SP)
+#define ALPHA_I_SP 232(SP)
+#define FZERO 240(SP)
+#endif
+
+#define M r3
+#define N r4
+#define K r5
+
+#ifdef linux
+#ifndef __64BIT__
+#define A r6
+#define B r7
+#define C r8
+#define LDC r9
+#define OFFSET r10
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#if !defined(__64BIT__) && defined(DOUBLE)
+#define A r10
+#define B r6
+#define C r7
+#define LDC r8
+#define OFFSET r9
+#else
+#define A r8
+#define B r9
+#define C r10
+#define LDC r6
+#define OFFSET r7
+#endif
+#endif
+
+#define o0 0
+#define alpha_r vs30
+#define alpha_i vs31
+
+#define KKK r13
+#define K1 r14
+#define L r15
+#define ALPHA r16
+#define o24 r17
+#define T2 r19
+#define KK r20
+#define o8 r21
+#define I r22
+#define J r23
+#define AO r24
+#define BO r25
+#define CO r26
+#define o16 r27
+#define o32 r28
+#define o48 r29
+
+#define PRE r30
+#define T1 r31
+
+#ifndef NEEDPARAM
+
+ PROLOGUE
+ PROFCODE
+
+ addi SP, SP, -STACKSIZE
+ li r0, 0
+
+ stfd f14, 0(SP)
+ stfd f15, 8(SP)
+ stfd f16, 16(SP)
+ stfd f17, 24(SP)
+
+ stfd f18, 32(SP)
+ stfd f19, 40(SP)
+ stfd f20, 48(SP)
+ stfd f21, 56(SP)
+
+ stfd f22, 64(SP)
+ stfd f23, 72(SP)
+ stfd f24, 80(SP)
+ stfd f25, 88(SP)
+
+ stfd f26, 96(SP)
+ stfd f27, 104(SP)
+ stfd f28, 112(SP)
+ stfd f29, 120(SP)
+
+ stfd f30, 128(SP)
+ stfd f31, 136(SP)
+
+#ifdef __64BIT__
+ std r31, 144(SP)
+ std r30, 152(SP)
+ std r29, 160(SP)
+ std r28, 168(SP)
+ std r27, 176(SP)
+ std r26, 184(SP)
+ std r25, 192(SP)
+ std r24, 200(SP)
+ std r23, 208(SP)
+ std r22, 216(SP)
+ std r21, 224(SP)
+ std r20, 232(SP)
+ std r19, 240(SP)
+ std r18, 248(SP)
+ std r17, 256(SP)
+ std r16, 264(SP)
+ std r15, 272(SP)
+ std r14, 280(SP)
+ std r13, 288(SP)
+#else
+ stw r31, 144(SP)
+ stw r30, 148(SP)
+ stw r29, 152(SP)
+ stw r28, 156(SP)
+ stw r27, 160(SP)
+ stw r26, 164(SP)
+ stw r25, 168(SP)
+ stw r24, 172(SP)
+ stw r23, 176(SP)
+ stw r22, 180(SP)
+ stw r21, 184(SP)
+ stw r20, 188(SP)
+ stw r19, 192(SP)
+ stw r18, 196(SP)
+ stw r17, 200(SP)
+ stw r16, 204(SP)
+ stw r15, 208(SP)
+ stw r14, 212(SP)
+ stw r13, 216(SP)
+#endif
+
+ stfd f1, ALPHA_R_SP
+ stfd f2, ALPHA_I_SP
+ stw r0, FZERO
+
+#ifdef linux
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz B, FRAMESLOT(0) + STACKSIZE(SP)
+ lwz C, FRAMESLOT(1) + STACKSIZE(SP)
+ lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
+#else
+ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+
+#ifdef TRMMKERNEL
+#if defined(linux) && defined(__64BIT__)
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+
+#if defined(_AIX) || defined(__APPLE__)
+#ifdef __64BIT__
+ ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#else
+#ifdef DOUBLE
+ lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
+#else
+ lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
+#endif
+#endif
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+ neg KK, OFFSET
+#endif
+#endif
+
+#include "zgemm_macros_8x2_power8.S"
+
+ cmpwi cr0, M, 0
+ ble .L999
+ cmpwi cr0, N, 0
+ ble .L999
+ cmpwi cr0, K, 0
+ ble .L999
+
+ slwi LDC, LDC, ZBASE_SHIFT
+ li PRE, 256
+ li o8 , 8
+ li o16 , 16
+ li o24 , 24
+ li o32 , 32
+ li o48 , 48
+
+#ifdef __64BIT__
+ addi ALPHA, SP, 296
+#else
+ addi ALPHA, SP, 224
+#endif
+
+ lxsdx alpha_r, 0, ALPHA
+ lxsdx alpha_i, o8, ALPHA
+
+ .align 4
+
+#include "ztrmm_logic_8x2_power8.S"
+
+.L999:
+ addi r3, 0, 0
+
+ lfd f14, 0(SP)
+ lfd f15, 8(SP)
+ lfd f16, 16(SP)
+ lfd f17, 24(SP)
+
+ lfd f18, 32(SP)
+ lfd f19, 40(SP)
+ lfd f20, 48(SP)
+ lfd f21, 56(SP)
+
+ lfd f22, 64(SP)
+ lfd f23, 72(SP)
+ lfd f24, 80(SP)
+ lfd f25, 88(SP)
+
+ lfd f26, 96(SP)
+ lfd f27, 104(SP)
+ lfd f28, 112(SP)
+ lfd f29, 120(SP)
+
+ lfd f30, 128(SP)
+ lfd f31, 136(SP)
+
+#ifdef __64BIT__
+ ld r31, 144(SP)
+ ld r30, 152(SP)
+ ld r29, 160(SP)
+ ld r28, 168(SP)
+ ld r27, 176(SP)
+ ld r26, 184(SP)
+ ld r25, 192(SP)
+ ld r24, 200(SP)
+ ld r23, 208(SP)
+ ld r22, 216(SP)
+ ld r21, 224(SP)
+ ld r20, 232(SP)
+ ld r19, 240(SP)
+ ld r18, 248(SP)
+ ld r17, 256(SP)
+ ld r16, 264(SP)
+ ld r15, 272(SP)
+ ld r14, 280(SP)
+ ld r13, 288(SP)
+#else
+ lwz r31, 144(SP)
+ lwz r30, 148(SP)
+ lwz r29, 152(SP)
+ lwz r28, 156(SP)
+ lwz r27, 160(SP)
+ lwz r26, 164(SP)
+ lwz r25, 168(SP)
+ lwz r24, 172(SP)
+ lwz r23, 176(SP)
+ lwz r22, 180(SP)
+ lwz r21, 184(SP)
+ lwz r20, 188(SP)
+ lwz r19, 192(SP)
+ lwz r18, 196(SP)
+ lwz r17, 200(SP)
+ lwz r16, 204(SP)
+ lwz r15, 208(SP)
+ lwz r14, 212(SP)
+ lwz r13, 216(SP)
+#endif
+
+ addi SP, SP, STACKSIZE
+
+ blr
+
+ EPILOGUE
+#endif
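Everything in the prologue above is bookkeeping: it saves f14-f31 and r13-r31, fetches LDC and OFFSET from the caller's frame, loads alpha into vs30/vs31, and converts LDC from elements to bytes with slwi LDC, LDC, ZBASE_SHIFT so the column stepping in the logic file can use plain adds. Assuming double-complex elements of 16 bytes (ZBASE_SHIFT == 4), the address arithmetic corresponds to this C sketch (names are illustrative):

```
#include <stdint.h>

typedef struct { double r, i; } zdouble;   /* one double-complex element, 16 bytes */

/* Column j of C when LDC is given in elements, mirroring
 * "slwi LDC, LDC, ZBASE_SHIFT" followed by repeated adds of LDC. */
static zdouble *column_ptr(zdouble *c, int64_t ldc_elems, int64_t j)
{
    int64_t ldc_bytes = ldc_elems << 4;                /* elements -> bytes */
    return (zdouble *)((char *)c + j * ldc_bytes);
}

int main(void)
{
    zdouble c[8] = {{0}};
    return column_ptr(c, 2, 3) == &c[6] ? 0 : 1;       /* 3 columns of 2 elements */
}
```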
diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S
new file mode 100644
index 000000000..f422b17b1
--- /dev/null
+++ b/kernel/power/ztrmm_logic_8x2_power8.S
@@ -0,0 +1,1237 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+
+
+ srawi. J, N, 1
+ ble .LZTRMM_L2_END
+
+.LZTRMM_L2_BEGIN:
+
+ mr CO, C
+ mr AO, A
+ slwi T1, LDC , 1
+ add C, C, T1
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble .LZTRMM_L2x8_END
+
+.LZTRMM_L2x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x8_SUB4
+
+.LZTRMM_L2x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_I1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x8_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x8_LOOP
+
+.LZTRMM_L2x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+
+ dcbt AO, PRE
+ KERNEL2x8_1
+ dcbt AO, PRE
+ KERNEL2x8_2
+ dcbt AO, PRE
+ KERNEL2x8_1
+ KERNEL2x8_E2
+
+ b .LZTRMM_L2x8_SUB1
+
+.LZTRMM_L2x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL2x8_SUBI1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+ dcbt AO, PRE
+ KERNEL2x8_SUB1
+
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+ KERNEL2x8_SUB1
+
+ b .LZTRMM_L2x8_SUB1
+
+.LZTRMM_L2x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x8_SAVE
+ b .LZTRMM_L2x8_SUB2
+
+.LZTRMM_L2x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x8_SAVE
+
+.LZTRMM_L2x8_SUB2:
+
+ KERNEL2x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x8_SUB2
+
+.LZTRMM_L2x8_SAVE:
+
+ SAVE2x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LZTRMM_L2x8_BEGIN
+
+.LZTRMM_L2x8_END:
+
+.LZTRMM_L2x4_BEGIN:
+ andi. T2, M, 7
+ ble .LZTRMM_L2x1_END
+
+ andi. T1, M, 4
+ ble .LZTRMM_L2x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x4_SUB4
+
+.LZTRMM_L2x4_LOOP_START:
+
+ LOAD2x4_1
+ KERNEL2x4_I1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x4_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x4_LOOP:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x4_LOOP
+
+.LZTRMM_L2x4_LOOP_END:
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_2
+
+ KERNEL2x4_1
+ KERNEL2x4_2
+ KERNEL2x4_1
+ KERNEL2x4_E2
+
+ b .LZTRMM_L2x4_SUB1
+
+.LZTRMM_L2x4_SUB4:
+
+ KERNEL2x4_SUBI1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+ KERNEL2x4_SUB1
+
+ b .LZTRMM_L2x4_SUB1
+
+.LZTRMM_L2x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x4_SAVE
+ b .LZTRMM_L2x4_SUB2
+
+.LZTRMM_L2x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x4_SAVE
+
+.LZTRMM_L2x4_SUB2:
+
+ KERNEL2x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x4_SUB2
+
+.LZTRMM_L2x4_SAVE:
+
+ SAVE2x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x4_END:
+
+.LZTRMM_L2x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LZTRMM_L2x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x2_SUB4
+
+.LZTRMM_L2x2_LOOP_START:
+
+ LOAD2x2_1
+ KERNEL2x2_I1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x2_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x2_LOOP:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x2_LOOP
+
+.LZTRMM_L2x2_LOOP_END:
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_2
+
+ KERNEL2x2_1
+ KERNEL2x2_2
+ KERNEL2x2_1
+ KERNEL2x2_E2
+
+ b .LZTRMM_L2x2_SUB1
+
+.LZTRMM_L2x2_SUB4:
+
+ KERNEL2x2_SUBI1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+ KERNEL2x2_SUB1
+
+ b .LZTRMM_L2x2_SUB1
+
+.LZTRMM_L2x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x2_SAVE
+ b .LZTRMM_L2x2_SUB2
+
+.LZTRMM_L2x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x2_SAVE
+
+.LZTRMM_L2x2_SUB2:
+
+ KERNEL2x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x2_SUB2
+
+.LZTRMM_L2x2_SAVE:
+
+ SAVE2x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x2_END:
+
+.LZTRMM_L2x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LZTRMM_L2x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 5 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L2x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L2x1_SUB4
+
+.LZTRMM_L2x1_LOOP_START:
+
+ LOAD2x1_1
+ KERNEL2x1_I1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L2x1_LOOP_END
+
+ .align 5
+
+.LZTRMM_L2x1_LOOP:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x1_LOOP
+
+.LZTRMM_L2x1_LOOP_END:
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_2
+
+ KERNEL2x1_1
+ KERNEL2x1_2
+ KERNEL2x1_1
+ KERNEL2x1_E2
+
+ b .LZTRMM_L2x1_SUB1
+
+.LZTRMM_L2x1_SUB4:
+
+ KERNEL2x1_SUBI1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+ KERNEL2x1_SUB1
+
+ b .LZTRMM_L2x1_SUB1
+
+.LZTRMM_L2x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL2x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L2x1_SAVE
+ b .LZTRMM_L2x1_SUB2
+
+.LZTRMM_L2x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L2x1_SAVE
+
+.LZTRMM_L2x1_SUB2:
+
+ KERNEL2x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L2x1_SUB2
+
+.LZTRMM_L2x1_SAVE:
+
+ SAVE2x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L2x1_END:
+
+ slwi T1, K, 5
+ add B, B, T1
+
+#if !defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in B
+#endif
+
+
+ addic. J, J, -1
+ bgt .LZTRMM_L2_BEGIN
+
+ andi. T2, N, 1
+ ble .L999
+
+.LZTRMM_L2_END:
+
+ b .LZTRMM_L1_BEGIN
+
+.L999_H1:
+
+ b .L999
+
+.LZTRMM_L1_BEGIN:
+
+ andi. T1, N, 1
+ ble .LZTRMM_L1_END
+ mr CO, C
+ mr AO, A
+
+#if defined(LEFT)
+ mr KK, OFFSET // OFFSET -> KK
+#endif
+
+ srawi. I, M, 3
+ ble .LZTRMM_L1x8_END
+
+.LZTRMM_L1x8_BEGIN:
+
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 7 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x8_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x8_SUB4
+
+.LZTRMM_L1x8_LOOP_START:
+
+ dcbt AO, PRE
+ LOAD1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_I1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x8_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x8_LOOP:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x8_LOOP
+
+.LZTRMM_L1x8_LOOP_END:
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+
+ dcbt AO, PRE
+ KERNEL1x8_1
+ dcbt AO, PRE
+ KERNEL1x8_2
+ dcbt AO, PRE
+ KERNEL1x8_1
+ KERNEL1x8_E2
+
+ b .LZTRMM_L1x8_SUB1
+
+.LZTRMM_L1x8_SUB4:
+
+ dcbt AO, PRE
+ KERNEL1x8_SUBI1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+ dcbt AO, PRE
+ KERNEL1x8_SUB1
+
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+ KERNEL1x8_SUB1
+
+ b .LZTRMM_L1x8_SUB1
+
+.LZTRMM_L1x8_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x8_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x8_SAVE
+ b .LZTRMM_L1x8_SUB2
+
+.LZTRMM_L1x8_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x8_SAVE
+
+.LZTRMM_L1x8_SUB2:
+
+ KERNEL1x8_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x8_SUB2
+
+.LZTRMM_L1x8_SAVE:
+
+ SAVE1x8
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 8 // KK += Number of values in A
+#endif
+
+
+ addic. I, I, -1
+ bgt .LZTRMM_L1x8_BEGIN
+
+.LZTRMM_L1x8_END:
+
+.LZTRMM_L1x4_BEGIN:
+ andi. T2, M, 7
+ ble .LZTRMM_L1x1_END
+
+ andi. T1, M, 4
+ ble .LZTRMM_L1x4_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 6 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x4_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x4_SUB4
+
+.LZTRMM_L1x4_LOOP_START:
+
+ LOAD1x4_1
+ KERNEL1x4_I1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x4_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x4_LOOP:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x4_LOOP
+
+.LZTRMM_L1x4_LOOP_END:
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_2
+
+ KERNEL1x4_1
+ KERNEL1x4_2
+ KERNEL1x4_1
+ KERNEL1x4_E2
+
+ b .LZTRMM_L1x4_SUB1
+
+.LZTRMM_L1x4_SUB4:
+
+ KERNEL1x4_SUBI1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+ KERNEL1x4_SUB1
+
+ b .LZTRMM_L1x4_SUB1
+
+.LZTRMM_L1x4_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x4_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x4_SAVE
+ b .LZTRMM_L1x4_SUB2
+
+.LZTRMM_L1x4_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x4_SAVE
+
+.LZTRMM_L1x4_SUB2:
+
+ KERNEL1x4_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x4_SUB2
+
+.LZTRMM_L1x4_SAVE:
+
+ SAVE1x4
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 4 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x4_END:
+
+.LZTRMM_L1x2_BEGIN:
+
+ andi. T1, M, 2
+ ble .LZTRMM_L1x2_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 5 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x2_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x2_SUB4
+
+.LZTRMM_L1x2_LOOP_START:
+
+ LOAD1x2_1
+ KERNEL1x2_I1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x2_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x2_LOOP:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x2_LOOP
+
+.LZTRMM_L1x2_LOOP_END:
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_2
+
+ KERNEL1x2_1
+ KERNEL1x2_2
+ KERNEL1x2_1
+ KERNEL1x2_E2
+
+ b .LZTRMM_L1x2_SUB1
+
+.LZTRMM_L1x2_SUB4:
+
+ KERNEL1x2_SUBI1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+ KERNEL1x2_SUB1
+
+ b .LZTRMM_L1x2_SUB1
+
+.LZTRMM_L1x2_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x2_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x2_SAVE
+ b .LZTRMM_L1x2_SUB2
+
+.LZTRMM_L1x2_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x2_SAVE
+
+.LZTRMM_L1x2_SUB2:
+
+ KERNEL1x2_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x2_SUB2
+
+.LZTRMM_L1x2_SAVE:
+
+ SAVE1x2
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 2 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x2_END:
+
+.LZTRMM_L1x1_BEGIN:
+
+ andi. T1, M, 1
+ ble .LZTRMM_L1x1_END
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ mr BO, B // B -> BO
+#else
+ mr BO, B // B -> BO
+ slwi T1, KK, 4 // Number of values in B shifted
+ slwi T2, KK, 4 // Number of values in A shifted
+ add BO, BO, T1 // Add values to BO
+ add AO, AO, T2 // Add values to AO
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ sub T1, K, KK // K - KK -> TEMP1
+#else
+ mr T1, KK // KK -> KTEMP
+#ifdef LEFT
+ addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP
+#else
+ addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP
+#endif
+#endif
+
+ mr KKK, T1
+ mr K1, T1
+ srawi. L, K1, 3 // KTEMP / 8 -> L
+ ble .LZTRMM_L1x1_SUB0
+ cmpwi cr0, L, 1
+ ble .LZTRMM_L1x1_SUB4
+
+.LZTRMM_L1x1_LOOP_START:
+
+ LOAD1x1_1
+ KERNEL1x1_I1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -2
+ ble .LZTRMM_L1x1_LOOP_END
+
+ .align 5
+
+.LZTRMM_L1x1_LOOP:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x1_LOOP
+
+.LZTRMM_L1x1_LOOP_END:
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_2
+
+ KERNEL1x1_1
+ KERNEL1x1_2
+ KERNEL1x1_1
+ KERNEL1x1_E2
+
+ b .LZTRMM_L1x1_SUB1
+
+.LZTRMM_L1x1_SUB4:
+
+ KERNEL1x1_SUBI1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+ KERNEL1x1_SUB1
+
+ b .LZTRMM_L1x1_SUB1
+
+.LZTRMM_L1x1_SUB0:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+
+ KERNEL1x1_SUBI1
+
+ addic. L, L, -1
+ ble .LZTRMM_L1x1_SAVE
+ b .LZTRMM_L1x1_SUB2
+
+.LZTRMM_L1x1_SUB1:
+
+ andi. L, K1, 7 // K1 & 7 -> L
+ ble .LZTRMM_L1x1_SAVE
+
+.LZTRMM_L1x1_SUB2:
+
+ KERNEL1x1_SUB1
+
+ addic. L, L, -1
+ bgt .LZTRMM_L1x1_SUB2
+
+.LZTRMM_L1x1_SAVE:
+
+ SAVE1x1
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ sub T1, K, KKK // K - KKK -> TEMP1
+ slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2
+ slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1
+ add BO, BO, T2 // BO += TEMP2 * number of values in B shifted
+ add AO, AO, T1 // AO += TEMP1 * number of values in A shifted
+#endif
+
+#if defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in A
+#endif
+
+
+.LZTRMM_L1x1_END:
+
+#if !defined(LEFT)
+ addi KK, KK, 1 // KK += Number of values in B
+#endif
+
+
+.LZTRMM_L1_END:
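Every tile in the logic file above follows the same shape: in the TRMM branches the A and B pointers first skip KK iterations (for the 8x2 double-complex tile that is KK<<7 bytes of A and KK<<5 bytes of B), the effective trip count is either K-KK or KK plus the tile width depending on LEFT/TRANSA, and the k loop runs unrolled by eight (srawi. L, K1, 3) with an andi. K1, 7 remainder. A hypothetical C rendering of that blocking for the branch that applies the pointer skip (pointer names and strides are illustrative, strides given in doubles):

```
#include <stdint.h>

/* Model of one 8x2 ZTRMM tile's k loop: skip kk iterations, then an
 * 8-way unrolled body plus a scalar remainder.  Per k step, A advances
 * 8 complex values (16 doubles) and B advances 2 complex values (4 doubles). */
static void ztrmm_tile_8x2_model(int64_t k, int64_t kk,
                                 const double *a, const double *b)
{
    const double *ao = a + kk * 16;      /* slwi T2, KK, 7  (128 bytes per k) */
    const double *bo = b + kk * 4;       /* slwi T1, KK, 5  ( 32 bytes per k) */
    int64_t k1 = k - kk;                 /* trip count in the same LEFT/TRANSA branch */

    for (int64_t l = 0; l < (k1 >> 3); l++) {   /* eight KERNEL2x8 steps       */
        ao += 8 * 16;
        bo += 8 * 4;
    }
    for (int64_t l = 0; l < (k1 & 7); l++) {    /* KERNEL2x8_SUB1 remainder    */
        ao += 16;
        bo += 4;
    }
    (void)ao; (void)bo;
}

int main(void)
{
    static double a[4096], b[1024];
    ztrmm_tile_8x2_model(19, 3, a, b);
    return 0;
}
```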
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 2dcc8658b..4874711bb 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S
endif
ifndef CGEMVNKERNEL
-CGEMVNKERNEL = cgemv_n.S
+CGEMVNKERNEL = cgemv_n_4.c
endif
ifndef CGEMVTKERNEL
-CGEMVTKERNEL = cgemv_t.S
+CGEMVTKERNEL = cgemv_t_4.c
endif
ifndef ZGEMVNKERNEL
-ZGEMVNKERNEL = zgemv_n.S
+ZGEMVNKERNEL = zgemv_n_4.c
endif
ifndef ZGEMVTKERNEL
-ZGEMVTKERNEL = zgemv_t.S
+ZGEMVTKERNEL = zgemv_t_4.c
endif
ifndef QGEMVNKERNEL
diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA
index 313c62d7c..70f3d6058 100644
--- a/kernel/x86_64/KERNEL.BARCELONA
+++ b/kernel/x86_64/KERNEL.BARCELONA
@@ -1,6 +1,3 @@
-ZGEMVNKERNEL = zgemv_n_dup.S
-ZGEMVTKERNEL = zgemv_t.S
-
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER
index c8ccae1ea..90834d9ca 100644
--- a/kernel/x86_64/KERNEL.BULLDOZER
+++ b/kernel/x86_64/KERNEL.BULLDOZER
@@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER
index 6c726a6e9..3ad142063 100644
--- a/kernel/x86_64/KERNEL.PILEDRIVER
+++ b/kernel/x86_64/KERNEL.PILEDRIVER
@@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c
SGEMVNKERNEL = sgemv_n_4.c
SGEMVTKERNEL = sgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DGEMVNKERNEL = dgemv_n_bulldozer.S
diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER
index 5291cc624..f14c82303 100644
--- a/kernel/x86_64/KERNEL.STEAMROLLER
+++ b/kernel/x86_64/KERNEL.STEAMROLLER
@@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c
DGEMVNKERNEL = dgemv_n_4.c
DGEMVTKERNEL = dgemv_t_4.c
-ZGEMVNKERNEL = zgemv_n_dup.S
+ZGEMVNKERNEL = zgemv_n_4.c
ZGEMVTKERNEL = zgemv_t_4.c
DCOPYKERNEL = dcopy_bulldozer.S
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index ff8058549..d60e4475d 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HASWELL)
#include "cgemv_n_microk_haswell-4.c"
+#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER)
+#include "cgemv_n_microk_bulldozer-4.c"
#endif
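The one-line change to cgemv_n_4.c simply lets the Bulldozer family (BULLDOZER, PILEDRIVER, STEAMROLLER) pull in the new FMA4 microkernel. The driver follows the usual OpenBLAS convention: an included microkernel defines HAVE_KERNEL_4x4 (and friends) and supplies the matching function, and a portable C loop is compiled only when nothing claimed the symbol. A sketch of that pattern, not the literal contents of cgemv_n_4.c:

```
#include "common.h"   /* BLASLONG, FLOAT */

/* Portable fallback used only when no arch-specific microkernel was
 * included above (an include would have defined HAVE_KERNEL_4x4). */
#ifndef HAVE_KERNEL_4x4
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
{
    BLASLONG i;
    for (i = 0; i < 2 * n; i += 2) {
        int j;
        for (j = 0; j < 4; j++) {
            FLOAT ar = ap[j][i], ai = ap[j][i + 1];
            FLOAT xr = x[2 * j], xi = x[2 * j + 1];
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            y[i]     += ar * xr - ai * xi;
            y[i + 1] += ar * xi + ai * xr;
#else
            y[i]     += ar * xr + ai * xi;
            y[i + 1] += ar * xi - ai * xr;
#endif
        }
    }
}
#endif
```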
diff --git a/kernel/x86_64/cgemv_n_microk_bulldozer-4.c b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c
new file mode 100644
index 000000000..a74b41269
--- /dev/null
+++ b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c
@@ -0,0 +1,541 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_4x4 1
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 384(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%6,%0,4) \n\t"
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%7,%0,4) \n\t"
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+ "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3
+
+ "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %8 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2
+ "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3
+
+ "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (n2) // 8
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
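cgemv_kernel_4x4 above leans on two instruction-set features of the Bulldozer family: the four-operand FMA4 form vfmaddps (dst = a*b + c, so the accumulator register survives), and the vpermilps/vaddsubps pair that turns two real-valued products into a packed complex multiply-accumulate. In scalar terms each lane pair does the following (a minimal model, not part of the patch):

```
#include <stdio.h>

/* One lane pair of the vmulps/vpermilps/vaddsubps sequence:
 *   t0 = (a_r*x_r, a_i*x_r)                 vmulps with broadcast x_r
 *   t1 = (a_r*x_i, a_i*x_i)                 vmulps with broadcast x_i
 *   y += addsub(t0, swap(t1))               vpermilps 0xb1 + vaddsubps
 *      = (a_r*x_r - a_i*x_i, a_i*x_r + a_r*x_i)
 * i.e. y += a*x in complex arithmetic (non-conjugated case shown). */
static void caxpy_lane(float a_r, float a_i, float x_r, float x_i,
                       float *y_r, float *y_i)
{
    float t0_r = a_r * x_r, t0_i = a_i * x_r;
    float t1_r = a_r * x_i, t1_i = a_i * x_i;
    *y_r += t0_r - t1_i;     /* even lane of vaddsubps subtracts */
    *y_i += t0_i + t1_r;     /* odd lane adds                    */
}

int main(void)
{
    float yr = 0.0f, yi = 0.0f;
    caxpy_lane(1.0f, 2.0f, 3.0f, -1.0f, &yr, &yi);   /* (1+2i)*(3-i) = 5+5i */
    printf("%g %g\n", yr, yi);
    return 0;
}
```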
+#define HAVE_KERNEL_4x2 1
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "prefetcht0 384(%5,%0,4) \n\t"
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+ "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,4) \n\t"
+
+ "addq $16, %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %6 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (n2) // 6
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+ BLASLONG register n1 = n & -8 ;
+ BLASLONG register n2 = n & 4 ;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastss (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
+
+ "cmpq $0 , %1 \n\t"
+ "je 2f \n\t"
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 384(%4,%0,4) \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 384(%3,%0,4) \n\t"
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,4), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "addq $16, %0 \n\t"
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "subq $8 , %1 \n\t"
+ "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
+ "vmovups %%ymm13,-32(%3,%0,4) \n\t"
+
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ "cmpq $4, %5 \n\t"
+ "jne 3f \n\t"
+
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
+
+ "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%3,%0,4), %%ymm10 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t"
+ "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t"
+#else
+ "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t"
+ "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t"
+#endif
+
+ "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n1), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap), // 4
+ "r" (n2) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+		for ( i=0; i<n; i++ )
+ {
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "prefetcht0 512(%4,%0,8) \n\t"
+
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
+
+ "prefetcht0 512(%5,%0,8) \n\t"
+
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "prefetcht0 512(%6,%0,8) \n\t"
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
+ "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
+
+ "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
+ "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
+
+ "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "prefetcht0 512(%7,%0,8) \n\t"
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+ }
+ else
+ {
+
+ __asm__ __volatile__
+ (
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+ "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2
+ "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2
+ "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3
+ "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+
+ "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
+ "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1
+ "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1
+
+
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2
+ "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2
+
+ "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3
+ "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3
+
+ "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "2: \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]) // 7
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+
+
+ }
+
+
+}
+
+#define HAVE_KERNEL_4x2 1
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+ "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1
+ "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
+
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+	"vmovups	(%4,%0,8), %%ymm8              \n\t" // 2 complex values from a0
+	"vmovups      32(%4,%0,8), %%ymm9              \n\t" // 2 complex values from a0
+
+	"vmovups	(%5,%0,8), %%ymm10             \n\t" // 2 complex values from a1
+	"vmovups      32(%5,%0,8), %%ymm11             \n\t" // 2 complex values from a1
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]) // 5
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+#define HAVE_KERNEL_4x1 1
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
+
+static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+
+ "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0
+ "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
+
+ // ".align 16 \n\t"
+ "1: \n\t"
+	"vmovups	(%4,%0,8), %%ymm8              \n\t" // 2 complex values from a0
+	"vmovups      32(%4,%0,8), %%ymm9              \n\t" // 2 complex values from a0
+
+ "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r
+ "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i
+ "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r
+ "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i
+
+ "vmovups (%3,%0,8), %%ymm10 \n\t"
+ "vmovups 32(%3,%0,8), %%ymm11 \n\t"
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+ "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t"
+ "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t"
+ "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t"
+ "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t"
+#else
+ "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t"
+ "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t"
+ "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t"
+ "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t"
+ "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t"
+ "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t"
+#endif
+
+ "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t"
+ "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t"
+
+ "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y
+ "vmovups %%ymm13, 32(%3,%0,8) \n\t"
+
+ "addq $8 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+ "jnz 1b \n\t"
+ "vzeroupper \n\t"
+
+ :
+ :
+ "r" (i), // 0
+ "r" (n), // 1
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap) // 4
+ : "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
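
For readers who prefer not to decode the AVX inline assembly, all three kernels accumulate one, two or four columns of A times the matching elements of x into y, four complex doubles per loop iteration, with the CONJ/XCONJ branches selecting how the real and imaginary products are combined. The plain-C sketch below mirrors only the non-conjugated path of zgemv_kernel_4x1; the name zgemv_ref_4x1 is hypothetical and the sketch is an illustrative reference, not part of the patch.

```c
#include <stddef.h>

/* Hypothetical scalar reference for the non-conjugated path of
 * zgemv_kernel_4x1: y += A(:,0) * x0 over n interleaved complex doubles. */
static void zgemv_ref_4x1(size_t n, const double *ap, const double *x, double *y)
{
	double x_r = x[0];   /* real part of x0, broadcast by vbroadcastsd */
	double x_i = x[1];   /* imaginary part of x0                       */

	for (size_t i = 0; i < n; i++) {
		double a_r = ap[2 * i];
		double a_i = ap[2 * i + 1];
		/* complex multiply-accumulate, matching the
		 * vmulpd/vpermilpd/vaddsubpd sequence in the
		 * !CONJ && !XCONJ branch                      */
		y[2 * i]     += a_r * x_r - a_i * x_i;
		y[2 * i + 1] += a_r * x_i + a_i * x_r;
	}
}
```

The 4x2 and 4x4 variants interleave the same computation for two and four columns so that the loads of a1..a3 overlap with the multiplies, which is the main point of the hand-written assembly.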
+
+
+
+
+#define HAVE_KERNEL_ADDY 1
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline));
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i)
+{
+ BLASLONG i;
+
+ if ( inc_dest != 2 )
+ {
+
+ FLOAT temp_r;
+ FLOAT temp_i;
+		for ( i=0; i<n; i++ )
diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f
-* If INFO > 0 from CHSEQR, then quit
+* If INFO .NE. 0 from CHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/cgetc2.f b/lapack-netlib/SRC/cgetc2.f
index fac6b56820..99eb69d92 100644
--- a/lapack-netlib/SRC/cgetc2.f
+++ b/lapack-netlib/SRC/cgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, CMPLX, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = SLAMCH( 'P' )
SMLNUM = SLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = CMPLX( SMLNUM, ZERO )
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN
*
diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f
index 4a000fe10..decdae509 100644
--- a/lapack-netlib/SRC/cggev3.f
+++ b/lapack-netlib/SRC/cggev3.f
@@ -339,16 +339,16 @@
$ LDVL, VR, LDVR, WORK, -1, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL CHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
- $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
- $ -1, WORK, IERR )
+ $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
ELSE
CALL CGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL,
$ VR, LDVR, WORK, -1, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL CHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
- $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK,
- $ -1, WORK, IERR )
+ $ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
END IF
WORK( 1 ) = CMPLX( LWKOPT )
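
The cggev3.f (and zggev3.f below) fix passes RWORK rather than WORK as the real-workspace argument of CHGEQZ/ZHGEQZ during the LWORK = -1 size query, so the complex work array is no longer handed to a real dummy argument. The code path being fixed is the usual two-pass workspace query; the C sketch below is illustrative only, the prototype is an assumption based on the documented CGGEV3 interface, and the integer widths would differ on an ILP64 build.

```c
#include <complex.h>
#include <stdlib.h>

/* Assumed prototype for the Fortran symbol (LP64 integers). */
extern void cggev3_(const char *jobvl, const char *jobvr, const int *n,
                    float complex *a, const int *lda,
                    float complex *b, const int *ldb,
                    float complex *alpha, float complex *beta,
                    float complex *vl, const int *ldvl,
                    float complex *vr, const int *ldvr,
                    float complex *work, const int *lwork,
                    float *rwork, int *info);

/* Two-pass pattern: query the optimal LWORK with lwork = -1, then allocate. */
static int eigenvalues_only(int n, float complex *a, float complex *b,
                            float complex *alpha, float complex *beta)
{
	int lwork = -1, info = 0, ldv = 1;
	float complex vdummy[1], wkopt;
	float *rwork = malloc(sizeof(float) * 8 * (size_t)n); /* RWORK >= 8*N */

	/* size query: the routine returns the optimal LWORK in WORK(1) */
	cggev3_("N", "N", &n, a, &n, b, &n, alpha, beta,
	        vdummy, &ldv, vdummy, &ldv, &wkopt, &lwork, rwork, &info);

	lwork = (int)crealf(wkopt);
	float complex *work = malloc(sizeof(float complex) * (size_t)lwork);

	/* actual computation with the allocated workspace */
	cggev3_("N", "N", &n, a, &n, b, &n, alpha, beta,
	        vdummy, &ldv, vdummy, &ldv, work, &lwork, rwork, &info);

	free(work);
	free(rwork);
	return info;
}
```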
diff --git a/lapack-netlib/SRC/dgeev.f b/lapack-netlib/SRC/dgeev.f
index dd60db69e..328eaa39c 100644
--- a/lapack-netlib/SRC/dgeev.f
+++ b/lapack-netlib/SRC/dgeev.f
@@ -418,9 +418,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from DHSEQR, then quit
+* If INFO .NE. 0 from DHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/dgetc2.f b/lapack-netlib/SRC/dgetc2.f
index 7e43a0236..3cd7eeb2b 100644
--- a/lapack-netlib/SRC/dgetc2.f
+++ b/lapack-netlib/SRC/dgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = DLAMCH( 'P' )
SMLNUM = DLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = SMLNUM
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN.
*
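
All four getc2 precisions receive the same guard. Rendered as C (the function below is a hypothetical illustration, not part of LAPACK), the new prologue amounts to:

```c
#include <math.h>

/* Hypothetical C rendering of the new DGETC2 prologue: quick return for
 * n == 0, and an explicit n == 1 case that flags (and perturbs) a pivot
 * smaller than SMLNUM instead of entering the complete-pivoting loop. */
static int getc2_prologue(int n, double *a, int *ipiv, int *jpiv, double smlnum)
{
	if (n == 0)
		return 0;                      /* nothing to factorize */

	if (n == 1) {
		ipiv[0] = 1;
		jpiv[0] = 1;
		if (fabs(a[0]) < smlnum) {
			a[0] = smlnum;         /* perturb the singular pivot  */
			return 1;              /* INFO = 1: U was perturbed   */
		}
		return 0;
	}

	return -1;  /* n >= 2: caller continues with complete pivoting */
}
```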
diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f
index 89dbe08c8..667de0afe 100644
--- a/lapack-netlib/SRC/sgeev.f
+++ b/lapack-netlib/SRC/sgeev.f
@@ -418,9 +418,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from SHSEQR, then quit
+* If INFO .NE. 0 from SHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/sgetc2.f b/lapack-netlib/SRC/sgetc2.f
index 3c3880d4e..598446519 100644
--- a/lapack-netlib/SRC/sgetc2.f
+++ b/lapack-netlib/SRC/sgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = SLAMCH( 'P' )
SMLNUM = SLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL SLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = SMLNUM
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN.
*
diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f
index d4520805f..a518b4cd9 100644
--- a/lapack-netlib/SRC/zgeev.f
+++ b/lapack-netlib/SRC/zgeev.f
@@ -404,9 +404,9 @@
$ WORK( IWRK ), LWORK-IWRK+1, INFO )
END IF
*
-* If INFO > 0 from ZHSEQR, then quit
+* If INFO .NE. 0 from ZHSEQR, then quit
*
- IF( INFO.GT.0 )
+ IF( INFO.NE.0 )
$ GO TO 50
*
IF( WANTVL .OR. WANTVR ) THEN
diff --git a/lapack-netlib/SRC/zgetc2.f b/lapack-netlib/SRC/zgetc2.f
index 3179612f5..bf59415b5 100644
--- a/lapack-netlib/SRC/zgetc2.f
+++ b/lapack-netlib/SRC/zgetc2.f
@@ -145,15 +145,33 @@
INTRINSIC ABS, DCMPLX, MAX
* ..
* .. Executable Statements ..
+*
+ INFO = 0
+*
+* Quick return if possible
+*
+ IF( N.EQ.0 )
+ $ RETURN
*
* Set constants to control overflow
*
- INFO = 0
EPS = DLAMCH( 'P' )
SMLNUM = DLAMCH( 'S' ) / EPS
BIGNUM = ONE / SMLNUM
CALL DLABAD( SMLNUM, BIGNUM )
*
+* Handle the case N=1 by itself
+*
+ IF( N.EQ.1 ) THEN
+ IPIV( 1 ) = 1
+ JPIV( 1 ) = 1
+ IF( ABS( A( 1, 1 ) ).LT.SMLNUM ) THEN
+ INFO = 1
+ A( 1, 1 ) = DCMPLX( SMLNUM, ZERO )
+ END IF
+ RETURN
+ END IF
+*
* Factorize A using complete pivoting.
* Set pivots less than SMIN to SMIN
*
diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f
index 1c4e832af..78337fd07 100644
--- a/lapack-netlib/SRC/zggev3.f
+++ b/lapack-netlib/SRC/zggev3.f
@@ -340,7 +340,7 @@
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL ZHGEQZ( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
- $ WORK, IERR )
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
ELSE
CALL ZGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL,
@@ -348,7 +348,7 @@
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
CALL ZHGEQZ( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
$ ALPHA, BETA, VL, LDVL, VR, LDVR, WORK, -1,
- $ WORK, IERR )
+ $ RWORK, IERR )
LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
END IF
WORK( 1 ) = DCMPLX( LWKOPT )
diff --git a/lapack-netlib/TESTING/nep.in b/lapack-netlib/TESTING/nep.in
index ed6869b80..af427fbde 100644
--- a/lapack-netlib/TESTING/nep.in
+++ b/lapack-netlib/TESTING/nep.in
@@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
0 5 7 3 200 Values of INIBL (nibble crossover point)
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
-30.0 Threshold value
+40.0 Threshold value
T Put T to test the error exits
1 Code to interpret the seed
NEP 21
diff --git a/param.h b/param.h
index 962f80ef3..31125d8e4 100644
--- a/param.h
+++ b/param.h
@@ -1959,6 +1959,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
+#if defined(POWER8)
+
+#define SNUMOPT 4
+#define DNUMOPT 8
+
+#define GEMM_DEFAULT_OFFSET_A 384
+#define GEMM_DEFAULT_OFFSET_B 1024
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 4
+#define SGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define DGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#define ZGEMM_DEFAULT_UNROLL_N 2
+
+#define SGEMM_DEFAULT_P 992
+#define DGEMM_DEFAULT_P 480
+#define CGEMM_DEFAULT_P 488
+#define ZGEMM_DEFAULT_P 240
+
+#define SGEMM_DEFAULT_Q 504
+#define DGEMM_DEFAULT_Q 720
+#define CGEMM_DEFAULT_Q 400
+#define ZGEMM_DEFAULT_Q 360
+
+#define SGEMM_DEFAULT_R 28800
+#define DGEMM_DEFAULT_R 14400
+#define ZGEMM_DEFAULT_R 7200
+
+#define SYMV_P 8
+
+#endif
+
+
#if defined(SPARC) && defined(V7)
#define SNUMOPT 4
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 6c7788d97..dfa42df67 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -5,6 +5,13 @@ set(OpenBLAS_utest_src
test_amax.c
)
+if (NOT NO_LAPACK)
+set(OpenBLAS_utest_src
+ ${OpenBLAS_utest_src}
+ test_potrs.c
+ )
+endif()
+
set(OpenBLAS_utest_bin openblas_utest)
add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
diff --git a/utest/Makefile b/utest/Makefile
index 716b1c784..9f9808920 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,6 +11,10 @@ include $(TOPDIR)/Makefile.system
OBJS=utest_main.o test_amax.o
#test_rot.o test_swap.o test_axpy.o test_dotu.o test_rotmg.o test_dsdot.o test_fork.o
+ifneq ($(NO_LAPACK), 1)
+OBJS += test_potrs.o
+endif
+
all : run_test
$(UTESTBIN): $(OBJS)
diff --git a/utest/ctest.h b/utest/ctest.h
index 01c50f73b..a62103ff5 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -1,4 +1,4 @@
-/* Copyright 2011-2015 Bas van den Berg
+/* Copyright 2011-2016 Bas van den Berg
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -58,6 +58,10 @@ struct ctest {
#define __CTEST_APPLE
#endif
+#ifdef __MINGW32__
+#undef CTEST_SEGFAULT
+#endif
+
#if defined(_WIN32) && defined(_MSC_VER)
#define __CTEST_MSVC
#endif
@@ -212,6 +216,9 @@ void assert_not_equal(intmax_t exp, intmax_t real, const char* caller, int line)
void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int line);
#define ASSERT_NOT_EQUAL_U(exp, real) assert_not_equal_u(exp, real, __FILE__, __LINE__)
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line);
+#define ASSERT_INTERVAL(exp1, exp2, real) assert_interval(exp1, exp2, real, __FILE__, __LINE__)
+
void assert_null(void* real, const char* caller, int line);
#define ASSERT_NULL(real) assert_null((void*)real, __FILE__, __LINE__)
@@ -511,6 +518,12 @@ void assert_not_equal_u(uintmax_t exp, uintmax_t real, const char* caller, int l
}
}
+void assert_interval(intmax_t exp1, intmax_t exp2, intmax_t real, const char* caller, int line) {
+ if (real < exp1 || real > exp2) {
+ CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real);
+ }
+}
+
void assert_dbl_near(double exp, double real, double tol, const char* caller, int line) {
double diff = exp - real;
double absdiff = diff;
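
The new assert_interval/ASSERT_INTERVAL pair fails a test when a value lies outside an inclusive range, which is handy for quantities that may legitimately vary between runs. A minimal usage sketch (the suite and test names here are made up):

```c
/* Illustrative only: suite and test names are hypothetical. */
CTEST(utest_example, value_in_range)
{
	int nthreads = 4;                 /* value under test              */
	ASSERT_INTERVAL(1, 64, nthreads); /* passes when 1 <= value <= 64  */
}
```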
diff --git a/utest/openblas_utest.h b/utest/openblas_utest.h
index fb70fdc27..abe381a92 100644
--- a/utest/openblas_utest.h
+++ b/utest/openblas_utest.h
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ctest.h"
#include <complex.h>
+#include <math.h>
#define SINGLE_EPS 1e-04
#define DOUBLE_EPS 1e-13
diff --git a/utest/test_potrs.c b/utest/test_potrs.c
new file mode 100644
index 000000000..41b3f6492
--- /dev/null
+++ b/utest/test_potrs.c
@@ -0,0 +1,96 @@
+/*****************************************************************************
+Copyright (c) 2011-2016, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. Neither the name of the OpenBLAS project nor the names of
+ its contributors may be used to endorse or promote products
+ derived from this software without specific prior written
+ permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+/*
+void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*);
+void BLASFUNC(zpotrs)(char*, BLASINT*, BLASINT*, complex double*,
+ BLASINT*, complex double*, BLASINT*, BLASINT*);
+*/
+
+
+//https://github.com/xianyi/OpenBLAS/issues/695
+CTEST(potrf, bug_695){
+
+ openblas_complex_float A1[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
+ -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
+ 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
+ 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
+ -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
+ 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
+ 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
+ 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
+ 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
+ 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
+ char up = 'U';
+
+ blasint n=10;
+ blasint info[1];
+ BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info);
+ //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91]));
+
+ openblas_complex_double A2[100] = {3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I, 0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I, -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I, 0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I, -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I,
+ -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I, -0.1691000759601593+0.313548743724823*I, -0.30911174416542053+0.7447023987770081*I, -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I, -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I, 0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I,
+ 0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I, 2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I, 0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I, 0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I, -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I,
+ 2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I, -0.18385779857635498+0.6468567848205566*I, 2.0811400413513184+0.0*I, -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I, -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I, 0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I,
+ -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I, -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I, 1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I, -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I, -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I,
+ 0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I, 0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I, -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I, -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I, -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I,
+ 2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I, 2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I, -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I, 1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I, -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I,
+ 0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I, -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I, -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I, -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I, 0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I,
+ 3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I, 2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I, -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I, 1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I, 0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I,
+ 0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I, 1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I, 0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I, 1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I, -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I};
+ openblas_complex_double B[20] = {-0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I, -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I, 0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I, -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I, -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I,
+ -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I, -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I, 0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I, 0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I, -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I};
+ char lo = 'L';
+ blasint nrhs = 2;
+ BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info);
+
+ // note that this is exactly equal to A1
+ openblas_complex_float A3[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
+ -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
+ 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
+ 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
+ -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
+ 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
+ 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
+ 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
+ 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
+ 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
+
+ BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info);
+ // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91]));
+ if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) {
+ CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__);
+ }
+}