From 9185d419d3c0452a898eb44618d47c11c9cd450e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:20 +0100 Subject: [PATCH 01/28] Version 0.3.5 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24c169afe..ac5dd93de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 5.dev) +set(OpenBLAS_PATCH_VERSION 5) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From eebc18928715775c9ed254684edee16e4efe0342 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Dec 2018 23:09:59 +0100 Subject: [PATCH 02/28] Version 0.3.5 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 0d5b83b39..3033455d3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.5.dev +VERSION = 0.3.5 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 11530b76f7b19fbb2d9089ab8166ab54bde8b423 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 09:58:56 +0200 Subject: [PATCH 03/28] Correct INFO=4 condition --- relapack/src/cgetrf.c | 2 +- relapack/src/dgetrf.c | 5 ++--- relapack/src/sgetrf.c | 7 +------ relapack/src/zgetrf.c | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 9aab718a0..878c9ec15 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index c4bce8fc5..be960fde9 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -15,16 +15,15 @@ void RELAPACK_dgetrf( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; - if (*info) { + if (*info!=0) { const blasint minfo = -*info; LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); return; diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 9d0ff1039..0231cc166 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,5 +1,4 @@ #include "relapack.h" - static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); @@ -22,16 +21,14 @@ void RELAPACK_sgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } - const blasint sn = MIN(*m, *n); - RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder @@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); @@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec( const blasint n1 = SREC_SPLIT(*n); const blasint n2 = *n - n1; const blasint m2 = 
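/* n is split at n1 = SREC_SPLIT(n): A_L below is the left m x n1 panel and A_R the remaining n2 columns. The left panel is factored recursively first and the trailing m2 x n2 block afterwards, the usual recursive-LU pattern. */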
*m - n1; - // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index 121b03401..b0d14ffb1 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -22,7 +22,7 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *n)) + else if (*ldA < MAX(1, *m)) *info = -4; if (*info) { const blasint minfo = -*info; From 2cd463eabdcecce01a379c7aaebbb0c48e21c27d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Apr 2019 10:02:28 +0200 Subject: [PATCH 04/28] Disable reallocation of work array in xSYTRF as it appears to cause memory management problems (seen in the LAPACK tests) --- relapack/config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relapack/config.h b/relapack/config.h index 9113a712d..e4fab0a12 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -36,8 +36,8 @@ // allow malloc in xsygst for improved performance #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC // allow malloc in xsytrf if the passed work buffer is too small -#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC - +//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC +#define XSYTRF_ALLOW_MALLOC 0 //////////////////////////////// // LAPACK routine replacement // From 1036299da06d4ebd60139529885804fa63400e10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 00:12:37 +0200 Subject: [PATCH 05/28] Disable repeated recursion on Ab_BR in ReLAPACK xGBTRF due to crashes in LAPACK tests --- relapack/src/cgbtrf.c | 4 +++- relapack/src/dgbtrf.c | 6 ++++-- relapack/src/sgbtrf.c | 20 +++++++++++++------- relapack/src/zgbtrf.c | 12 +++++++----- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index eddfdedf7..61332c6a6 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index f4b443629..cdf06ad5b 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,5 +1,6 @@ #include "relapack.h" -#include "stdlib.h" +#include +#include static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, const blasint *, blasint *); @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3a4de4ece..3e3fdf455 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -27,7 +27,7 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { const blasint minfo = -*info; @@ -55,15 +55,16 @@ void RELAPACK_sgbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint 
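/* the replacement lines below wrap each extent in abs(), keeping the work-array sizes non-negative before they reach malloc and slaset */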
mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); + const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); + const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); + const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); + // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -127,7 +129,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B + // ipiv_B blasint *const ipiv_T = ipiv; blasint *const ipiv_B = ipiv + n1; @@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec( float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; + // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec( } } + // recursion(Ab_BR, ipiv_B) - RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +//cause of infinite recursion here ? +// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index 0dd3fa7c3..d4ba41753 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -56,10 +56,10 @@ void RELAPACK_zgbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ( (*kl > n1) ? 
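/* same non-negative clamping as in the single-precision sgbtrf above; the complex work buffers allocated just below hold two doubles per element, hence the factor of 2 in their malloc sizes */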
MAX(0, *n - *kl) : *kl); double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); + LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + if (*info) *info += n1; // shift pivots From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sat, 13 Apr 2019 13:56:19 +0000 Subject: [PATCH 06/28] sgemm/strmm --- CONTRIBUTORS.md | 5 +- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/sgemm_kernel_power9.S | 286 ++ kernel/power/sgemm_logic_power9.S | 2133 ++++++++++ kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 8256 insertions(+), 6 deletions(-) create mode 100644 kernel/power/sgemm_kernel_power9.S create mode 100644 kernel/power/sgemm_logic_power9.S create mode 100644 kernel/power/sgemm_macros_power9.S diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..6d5cf9068 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,16 +3,16 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = strmm_kernel_16x8_power8.S +STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..a44659468 --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) + +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + +/* cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 +*/ + + + /*alpha is stored in f1. 
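f1 overlaps VSX register vs1, so the incoming double-precision value is directly visible to the vector unit: xscvdpspn narrows it to single precision and xxspltw copies that word into all four lanes of alpha_r for use by the vector kernels. In short,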
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + ori T2, T2, perm_const2@higher + rldicr T2, T2, 32, 31 + oris T2, T2, perm_const2@h + ori T2, T2, perm_const2@l + + lis T1, perm_const1@highest + ori T1, T1, perm_const1@higher + rldicr T1, T1, 32, 31 + oris T1, T1, perm_const1@h + ori T1, T1, perm_const1@l + + mtvsrdd permute_mask,T2,T1 + + lis T2, save_permute_12@highest + ori T2, T2, save_permute_12@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_12@h + ori T2, T2, save_permute_12@l + + lis T1, save_permute_11@highest + ori T1, T1, save_permute_11@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_11@h + ori T1, T1, save_permute_11@l + + mtvsrdd save_permute_1,T2,T1 + + lis T2, save_permute_22@highest + ori T2, T2, save_permute_22@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_22@h + ori T2, T2, save_permute_22@l + + lis T1, save_permute_21@highest + ori T1, T1, save_permute_21@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_21@h + ori T1, T1, save_permute_21@l + + mtvsrdd save_permute_2,T2,T1 + +#include "sgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S new file mode 100644 index 000000000..300e30470 --- /dev/null +++ b/kernel/power/sgemm_logic_power9.S @@ -0,0 +1,2133 @@ +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
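/* L = number of full 64-iteration blocks in K-1, i.e. how many unrolled passes the main 8x16 loop makes; the record (.) form sets CR0 so the ble below can branch to the tail handling when L is zero */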
L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + + MY_ALIGN +LSGEMM_L8x16_LOOP_START: + + LOAD8x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=32 + addi AO,AO,2112 + addi BO,BO,32 + + mtctr L + + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 -2048,0, 0,0 + KERNEL8x16_I1_L4_2 -2048,0, 1,0 + KERNEL8x16_I1_L4_2 -2048,0, 2,0 + KERNEL8x16_I1_L4_2 -2048,0, 3,0 + KERNEL8x16_I1_L4_2 -2048,0, 4,0 + KERNEL8x16_I1_L4_2 -2048,0, 5,0 + KERNEL8x16_I1_L4_2 -2048,0, 6,0 + KERNEL8x16_I1_L4_2 -2048,0, 7,0 + KERNEL8x16_I1_L4_2 -2048,0, 8,0 + KERNEL8x16_I1_L4_2 -2048,0, 9,0 + KERNEL8x16_I1_L4_2 -2048,0, 10,0 + KERNEL8x16_I1_L4_2 -2048,0, 11,0 + KERNEL8x16_I1_L4_2 -2048,0, 12,0 + KERNEL8x16_I1_L4_2 -2048,0, 13,0 + KERNEL8x16_I1_L4_2 -2048,0, 14,0 + KERNEL8x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + + END8x16 0, AO, BO, -2048, 0 + + b LSGEMM_L8x16_SUB1 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L8x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L8x16_SUB2_LOOP: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + bdnz LSGEMM_L8x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_3 64,32, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_0 + KERNEL8x16_I1_L4_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_0 + KERNEL8x16_I1_L2_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L8x16_SUB2 + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. 
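/* leftover iteration count for the 8x8 tail: the low five bits of K here, or of the trimmed length T11 in the TRMM branch above */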
L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. 
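/* the SUB2 blocks drain the remaining L iterations by testing its bits in turn (4, then 2, then 1) and issuing a matching kernel call for each bit that is set */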
T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. 
T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S new file mode 100644 index 000000000..c61f419ac --- /dev/null +++ b/kernel/power/sgemm_macros_power9.S @@ -0,0 +1,5828 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + LOAD8x16 1 +.endm + +.macro LOAD8x16_0 + LOAD8x16 0 +.endm + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + 
xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + 
xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, 
permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + xvmulsp vs50, vs6,vs12 + xvmulsp vs51, vs7,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + xvmulsp vs54, vs6,vs13 + xvmulsp vs55, vs7,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + xvmulsp vs58, vs6,vs14 + xvmulsp vs59, vs7,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + xvmulsp vs62, vs6,vs15 + xvmulsp vs63, vs7,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endif + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv 
vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ +#ifndef TRMMKERNEL + + lxv vs32, 0(T4) + lxv vs33, 16(T4) + lxv vs34, 32(T4) + lxv vs35, 48(T4) + lxv vs36, 0(T5) + lxv vs37, 16(T5) + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) + lxv vs42, 32(T6) + lxv vs43, 48(T6) + lxv vs44, 0(T7) + lxv vs45, 16(T7) + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 
+ xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + stxv vs32, 0(T4) + stxv vs33, 16(T4) + stxv vs34, 32(T4) + stxv vs35, 48(T4) + + stxv vs36, 0(T5) + stxv vs37, 16(T5) + stxv vs38, 32(T5) + stxv vs39, 48(T5) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs40, 0(T6) + stxv vs41, 16(T6) + stxv vs42, 32(T6) + stxv vs43, 48(T6) + stxv vs44, 0(T7) + stxv vs45, 16(T7) + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 
0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, 
DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, 
DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, 
vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + 
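
As a scalar reference for what the vectorized mr x nr micro-tile macros in this file compute (8x16, 8x8, 8x4, ...), the following minimal C sketch may help; it is illustrative only, the function and parameter names are assumptions rather than code from this patch, and it presumes the packed A/B panel layout produced by the OpenBLAS copy routines together with a column-major C tile. The overwrite_c flag mirrors the TRMMKERNEL branches, which build the tile with xvmulsp (C = alpha*A*B) instead of accumulating with xvmaddasp (C += alpha*A*B).

    /* Scalar model of one mr x nr SGEMM micro-tile (illustrative sketch only).
     * a: packed A panel, k slices of mr floats; b: packed B panel, k slices of nr floats;
     * c: column-major output tile with leading dimension ldc. */
    static void sgemm_tile_ref(int mr, int nr, int k,
                               const float *a, const float *b,
                               float alpha, float *c, int ldc,
                               int overwrite_c)
    {
        for (int j = 0; j < nr; j++) {
            for (int i = 0; i < mr; i++) {
                float acc = 0.0f;
                for (int l = 0; l < k; l++)
                    acc += a[l * mr + i] * b[l * nr + j];   /* dot product along K */
                if (overwrite_c)
                    c[j * ldc + i] = alpha * acc;           /* TRMMKERNEL path (xvmulsp)    */
                else
                    c[j * ldc + i] += alpha * acc;          /* GEMM path (xvmaddasp into C) */
            }
        }
    }

In the assembly, the DISPn(ind,disp) macros defined at the top of this file play the role of the a[l*mr + i] / b[l*nr + j] indexing: with unit_size = 4 they expand to byte offsets ind*4*n + disp into the packed panels, so in the 8x16 kernel unrolled by four k-steps DISP64 walks 64-float slices of the A panel while DISP32 walks 32-float slices of the B panel.
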
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp 
vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + 
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + 
xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** 
+* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, 
DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv 
vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro 
Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 
First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + 
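+/**********************************************************************************************
+* Conventions shared by the tile macros in this file (the DISPn addressing helpers are
+* assumed to expand to Index*n*4 + disp, i.e. the byte displacement of the Index-th
+* unrolled step; their definitions are not repeated here):
+*   Zero<N>x<M>            - clear the VSX accumulators of one MxN tile of C
+*   LOAD<N>x<M>            - load the first A/B vectors and build permuted copies of B
+*                            (xxperm/xxpermdi) so each accumulator pairs A with a different
+*                            rotation of the B values; the smaller N=2/N=1 kernels use
+*                            xxspltw broadcasts of B instead
+*   KERNEL<N>x<M>_L1_L4_I  - FMA body unrolled over four K iterations (_L1_L2_I covers two);
+*                            Index selects the unrolled step, IsLast advances the A/B
+*                            pointers, Complete skips the loads belonging to the next step
+*   END<N>x<M>             - fold the last loaded vectors into the accumulators
+*   SAVE<N>x<M>            - rearrange the accumulators into C order, scale by alpha_r and
+*                            either overwrite C (TRMMKERNEL) or accumulate into the loaded C
+**********************************************************************************************/
+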
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp 
vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert 
alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, 
vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, 
DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp 
vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 
2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + 
xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + 
xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi 
\AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub 
\TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 938a82a9e..d59cb1656 100644 --- a/param.h +++ b/param.h @@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 +#define SGEMM_DEFAULT_P 640 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 -#define SGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 1408 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From 9763f872fcb841a00926f31c801bfd007a5337b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:18:26 +0200 Subject: [PATCH 07/28] Update Changelog with changes from 0.3.6 --- Changelog.txt | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 49b26873a..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,82 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.6 +29-Apr-2019 + +common: + * the build tools now check that a given cpu TARGET is actually valid + * the build-time check of system features (c_check) has been made + less dependent on particular perl features (this should mainly + benefit building on Windows) + * several problem with the ReLAPACK integration were fixed, + including INTERFACE64 support and building a shared library + * building with CMAKE on BSD systems was improved + * a non-absolute SUM function was added based on the + existing optimized code for ASUM + * CBLAS interfaces to the IxMIN and IxMAX functions were added + * a name clash between LAPACKE and BOOST headers was resolved + * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel + kernels + * a crash on thread (key) deletion with the USE_TLS=1 memory management + option was fixed + * restored several earlier fixes, in particular for OpenMP performance, + building on BSD, and calling fork on CYGWIN, which had inadvertently + been dropped in the 0.3.3 rewrite of the memory management code. + +x86_64: + * the AVX512 DGEMM kernel has been disabled again due to unsolved problems + * building with old versions of MSVC was fixed + * it is now possible to build a static library on Windows with CMAKE + * accessing environment variables on CYGWIN at run time was fixed + * the CMAKE build system now recognizes 32bit userspace on 64bit hardware + * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected + * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported + with CMAKE as well + * building for DYNAMIC_ARCH with GENERIC as the default target is now supported + * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed + * assembly bugs involving undeclared modification of input operands were fixed + in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, + Sandybridge, Haswell, Bulldozer and Piledriver. 
These would typically cause + test failures or segfaults when compiled with recent versions of gcc from 8 onward. + * a similar bug was fixed in the blas_quickdivide code used to split workloads + in most functions + * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX + * fixed building on SkylakeX systems when either the compiler or the (emulated) operating + environment does not support AVX512 + * improved GEMM performance on ZEN targets + +x86: + * build failures caused by the recently added checks for AVX512 were fixed + * an inline assembly bug involving undeclared modification of an input argument was + fixed in the blas_quickdivide code used to split workloads in most functions + * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX + +MIPS32: + * a bug in the IMIN implementation made it return the result of IMAX + +POWER: + * single precision BLAS1/2 functions have received optimized POWER8 kernels + * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel + * building on PPC970 systems under OSX Leopard or Tiger is now supported + * out-of-bounds memory accesses in the gemm_beta microkernels were fixed + * building a shared library on AIX is now supported for POWER6 + * DYNAMIC_ARCH support has been added for POWER6 and newer + +ARMv7: + * corrected xDOT behaviour with zero INC_X or INC_Y + * a bug in the IMIN implementation made it return the result of IMAX + +ARMv8: + * added support for HiSilicon TSV110 cpus + * the CMAKE build system now recognizes 32bit userspace on 64bit hardware + * cross-compilation with CMAKE now works again + * a bug in the IMIN implementation made it return the result of IMAX + * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 + +IBM Z: + * optimized microkernels for single precicion BLAS1/2 functions have been added + for both Z13 and Z14 + ==================================================================== Version 0.3.5 31-Dec-2018 From bfeb9c16b0011f4f5f508a6d6df18017ab28f95a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:24:53 +0200 Subject: [PATCH 08/28] Increment version to 0.3.7.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..8900973a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4f8143b098418487b261653b48b16dc71cc2a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:25:32 +0200 Subject: [PATCH 09/28] Increment version to 0.3.7.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..b46479d03 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From daf2fec12db90c02aa74cb13726efd8f9b708312 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Mon, 29 Apr 2019 17:03:56 -0400 Subject: [PATCH 10/28] Misc. 
typo fixes Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib` --- Changelog.txt | 14 +++++++------- Makefile.rule | 6 +++--- README.md | 2 +- cmake/kernel.cmake | 2 +- cmake/system.cmake | 2 +- cmake/utils.cmake | 2 +- common_stackalloc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest/c_cblat1.f | 2 +- ctest/c_dblat1.f | 2 +- ctest/c_sblat1.f | 2 +- ctest/c_zblat1.f | 2 +- driver/others/blas_server.c | 6 +++--- driver/others/blas_server_win32.c | 4 ++-- driver/others/init.c | 2 +- driver/others/memory.c | 2 +- f_check | 2 +- interface/CMakeLists.txt | 2 +- interface/axpy.c | 2 +- interface/zaxpy.c | 2 +- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- test/cblat1.f | 2 +- test/dblat1.f | 2 +- test/sblat1.f | 2 +- test/zblat1.f | 2 +- 37 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..9feacf071 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added + * optimized microkernels for single precision BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputing the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputting the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatiable issues with Clang and Pathscale compilers. + * Fixed compatible issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small imput size & multithreads. + * Work-around #27 the low performance axpy issue with small input size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause zdotu & zdotc failures. Instead, work-around it. 
(Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) diff --git a/Makefile.rule b/Makefile.rule index b46479d03..17815096e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -181,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- diff --git a/README.md b/README.md index 26055c745..76a65b74b 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..d0f560872 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -283,7 +283,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. 
diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..f59ff6627 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..0aad9c407 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..3fe31168d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif diff --git a/f_check b/f_check index 34caa00be..b05db85bd 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ff3c5268d..ada701d70 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index 340234270..ffc4766d2 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index f9d3b445a..9cd1d17ad 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index da340774e..621489085 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index e8f6eb412..492f9fd46 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 0619d3eca..79b2eb806 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index 353e63ee8..f21e5aa8b 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index 1e93b843a..d97a695f5 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 249aff275..7614dcd32 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index 8df5609ad..c8487cf7c 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 7e52ef74e..5dc03bac9 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 9e4f85380..5f52622e2 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
* From b43c8382c885551b0f230c8493e79bf04d94e366 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 May 2019 10:46:46 +0200 Subject: [PATCH 11/28] Correct argument of CPU_ISSET for glibc <2.5 fixes #2104 --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..db14cde02 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Wed, 1 May 2019 19:36:22 +0000 Subject: [PATCH 12/28] conflict resolve --- kernel/power/KERNEL.POWER9 | 10 +++++----- kernel/power/icamax.c | 2 +- kernel/power/icamin.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 6d5cf9068..0e0d62393 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,11 +12,11 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; #if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; #else diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} From 858e609e1feba715065a65034eef02c9516aa107 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:01:29 -0400 Subject: [PATCH 13/28] Revert reference/ fixes --- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ada701d70..ff3c5268d 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index ffc4766d2..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index 9cd1d17ad..f9d3b445a 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index 621489085..da340774e 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index 492f9fd46..e8f6eb412 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 79b2eb806..0619d3eca 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index f21e5aa8b..353e63ee8 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index d97a695f5..1e93b843a 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 7614dcd32..249aff275 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index c8487cf7c..8df5609ad 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 5dc03bac9..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 5f52622e2..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of From b46875b76b8d4ebbc320547c20f7f4486fe52563 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:43:17 -0400 Subject: [PATCH 14/28] Revert Changelog.txt typos --- Changelog.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 9feacf071..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precision BLAS1/2 functions have been added + * optimized microkernels for single precicion BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputting the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputing the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatible issues with Clang and Pathscale compilers. + * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small input size & multithreads. 
+ * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) From 7ed8431527eb00f161de4dd309fd4d2b6c885b0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 May 2019 22:54:41 +0200 Subject: [PATCH 15/28] Disable the SkyLakeX DGEMMITCOPY kernel as well as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5d0a300b5..3c678904d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,7 +10,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From b1561ecc6864428baa4f1336d47d23729b9636f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 May 2019 15:52:01 +0200 Subject: [PATCH 16/28] Disable DGEMMINCOPY as well for now #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3c678904d..d61c51628 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -9,7 +9,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From a6a8cc2b7fa30f46fdaa4fb6e50c19da8c11e335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 May 2019 13:34:52 +0200 Subject: [PATCH 17/28] Fix errors in cpu enumeration with glibc 2.6 for #2114 --- driver/others/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = 
CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif From c516209581a77790b8d67d6dcd0c3f95fe713643 Mon Sep 17 00:00:00 2001 From: Diazonium Date: Tue, 7 May 2019 14:55:20 +0200 Subject: [PATCH 18/28] Change two http links to https Closes #2109 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76a65b74b..620e393f1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages @@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies From 7d1b468d9d83789d25eb6996afb5e358ee861f1d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 8 May 2019 09:58:01 +0800 Subject: [PATCH 19/28] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..aa912913d --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,19 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + vmImage: 'ubuntu-latest' + +steps: +- script: echo Hello, world! + displayName: 'Run a one-line script' + +- script: | + echo Add other tasks to build, test, and deploy your project. + echo See https://aka.ms/yaml + displayName: 'Run a multi-line script' From e47b63466b26dab9618443fd5754885bea653845 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 7 May 2019 16:06:42 -0700 Subject: [PATCH 20/28] TST: add native POWER8 to CI * add native POWER8 testing to Travis CI matrix with ppc64le os entry --- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index eee7674fe..00a2509f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,15 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 70cea0b96b70330ae6ef80b954e708d6acd86911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 12:20:00 +0200 Subject: [PATCH 21/28] Update link to IBM MASS library, update cpu support status --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620e393f1..68a121498 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`. 
### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS From 3a49e8c05aa24bba832e5e05bd8888fbee039919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:52:22 +0200 Subject: [PATCH 22/28] first try migrating one of the arm builds from travis --- azure-pipelines.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aa912913d..87b4de3f0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,26 @@ steps: displayName: 'Run a one-line script' - script: | - echo Add other tasks to build, test, and deploy your project. - echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ + condition: not(startsWith(variables['CONFIG'], 'linux_64')) + displayName: Configure binfmt_misc + +- script: | + echo "FROM openblas/alpine:arm32 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + +#- script: | +# echo Add other tasks to build, test, and deploy your project. 
+# echo See https://aka.ms/yaml +# displayName: 'Run a multi-line script' From 5cf434167ab9622c6788e4fdc9b418ab7bf96e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:58:59 +0200 Subject: [PATCH 23/28] fix tabbing in azure commands --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 87b4de3f0..3b277073a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,10 +14,10 @@ steps: displayName: 'Run a one-line script' - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: Configure binfmt_misc + displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - + displayname: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From aa4c41bad26bbb6d550ddad3141063c2260b7afd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:12:02 +0200 Subject: [PATCH 24/28] Update azure-pipelines.yml take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b277073a..d7e6cdc9b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,9 +15,9 @@ steps: - script: | docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: 'Configure binfmt_misc' +# ls /proc/sys/fs/binfmt_misc/ +# condition: not(startsWith(variables['CONFIG'], 'linux_64')) +# displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 From 16fd8e3dbe510802860f1981321bf9cd70676de4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:14:22 +0200 Subject: [PATCH 25/28] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d7e6cdc9b..12ea40b61 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayname: 'Run ARMV6 docker build' + displayName: 'Run ARMV6 docker build' + #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From a598ab1d32c1d5fcf9b9eb0c503a24db13757bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:23:54 +0200 Subject: [PATCH 26/28] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 12ea40b61..2b092c256 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayName: 'Run ARMV6 docker build' +# displayName: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. 
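Several of the patches above (23 through 26) keep adjusting the same detail of the ARMV6 build step: in Azure Pipelines YAML, `displayName` has to sit at the same indentation as `script` inside the step's list item, with the block-scalar body indented one level deeper. As a reference point, here is a minimal sketch of the step this part of the series is converging on (the final patch below restores `displayName` at exactly this level). The image name `openblas/alpine:arm32` and the build commands are taken verbatim from the diffs above; the indentation and the shell comments are only illustrative of standard step syntax, not something these patches themselves establish.

    steps:
    - script: |
        # write a throwaway Dockerfile that configures and builds OpenBLAS for
        # TARGET=ARMV6 inside the arm32 Alpine image, then build that image
        echo "FROM openblas/alpine:arm32
        COPY . /tmp/openblas
        RUN mkdir /tmp/openblas/build && \
            cd /tmp/openblas/build && \
            CC=gcc cmake -D DYNAMIC_ARCH=OFF \
                         -D TARGET=ARMV6 \
                         -D BUILD_SHARED_LIBS=ON \
                         -D BUILD_WITHOUT_LAPACK=ON \
                         -D BUILD_WITHOUT_CBLAS=ON \
                         -D CMAKE_BUILD_TYPE=Release ../ && \
            cmake --build ." > Dockerfile
        docker build .
      displayName: 'Run ARMV6 docker build'   # aligned with 'script', not with the list dash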
From dd77a3f0e27dee0c15b6e1da3649aba6723631ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:25:43 +0200 Subject: [PATCH 27/28] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2b092c256..e25f11cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,6 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . + + # displayName: 'Run ARMV6 docker build' #- script: | From ad20ceaa680e555e6f4e5e6d199f4c158ef1b6df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 19:07:58 +0200 Subject: [PATCH 28/28] Update azure-pipelines.yml --- azure-pipelines.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e25f11cb1..0b1ba16fd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,14 +13,14 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset +#- script: | +# docker run --rm --privileged multiarch/qemu-user-static:register --reset # ls /proc/sys/fs/binfmt_misc/ # condition: not(startsWith(variables['CONFIG'], 'linux_64')) # displayName: 'Configure binfmt_misc' - script: | - echo "FROM openblas/alpine:arm32 + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -31,10 +31,8 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - - -# displayName: 'Run ARMV6 docker build' + docker build . + displayName: Run ARMV6 docker build #- script: | # echo Add other tasks to build, test, and deploy your project.
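The "Configure binfmt_misc" step that ends up commented out by the last patches is what normally lets the x86_64 `ubuntu-latest` agent run an arm32 container in the first place: it registers the qemu user-mode emulators with the kernel's binfmt_misc handler so that the ARM binaries inside `openblas/alpine:arm32` can be executed transparently. Re-enabled, the step would look roughly as below; the two commands are verbatim from patch 22, while the comments and the expectation of a `qemu-arm` entry appearing are added illustration rather than something this series verifies.

    - script: |
        # register qemu user-mode emulators with binfmt_misc so the x86_64 agent
        # can transparently run the arm32 binaries in openblas/alpine:arm32
        docker run --rm --privileged multiarch/qemu-user-static:register --reset
        ls /proc/sys/fs/binfmt_misc/   # a qemu-arm entry should now be listed
      displayName: 'Configure binfmt_misc'

Without it, the `docker build .` of the ARMV6 image depends on the agent already having qemu/binfmt configured, which is presumably why this step keeps being toggled while the pipeline is being debugged.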