From 0f105dd8a5a597b2f468f774a52da226581efbdc Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Sat, 13 Apr 2019 13:56:19 +0000 Subject: [PATCH 001/127] sgemm/strmm --- CONTRIBUTORS.md | 5 +- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/sgemm_kernel_power9.S | 286 ++ kernel/power/sgemm_logic_power9.S | 2133 ++++++++++ kernel/power/sgemm_macros_power9.S | 5828 ++++++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 8256 insertions(+), 6 deletions(-) create mode 100644 kernel/power/sgemm_kernel_power9.S create mode 100644 kernel/power/sgemm_logic_power9.S create mode 100644 kernel/power/sgemm_macros_power9.S diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 08f8cc69d..3859a9c19 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,4 +167,7 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - + * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes + * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes + * [2019-03-14] power9 dgemm/dtrmm kernel + * [2019-04-29] power9 sgemm/strmm kernel diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 86a931971..6d5cf9068 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -3,16 +3,16 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = strmm_kernel_16x8_power8.S +STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMOTCOPY = sgemm_tcopy_8_power8.S SGEMMINCOPYOBJ = sgemm_incopy.o SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S new file mode 100644 index 000000000..a44659468 --- /dev/null +++ b/kernel/power/sgemm_kernel_power9.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) + +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + +/* cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 +*/ + + + /*alpha is stored in f1. 
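xscvdpspn rounds the double-precision value to single precision and places it in word element 0 of the target VSX register; xxspltw then broadcasts that word into all four lanes of alpha_r, so a single xvmulsp against alpha_r scales four C elements at once. In short: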
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + ori T2, T2, perm_const2@higher + rldicr T2, T2, 32, 31 + oris T2, T2, perm_const2@h + ori T2, T2, perm_const2@l + + lis T1, perm_const1@highest + ori T1, T1, perm_const1@higher + rldicr T1, T1, 32, 31 + oris T1, T1, perm_const1@h + ori T1, T1, perm_const1@l + + mtvsrdd permute_mask,T2,T1 + + lis T2, save_permute_12@highest + ori T2, T2, save_permute_12@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_12@h + ori T2, T2, save_permute_12@l + + lis T1, save_permute_11@highest + ori T1, T1, save_permute_11@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_11@h + ori T1, T1, save_permute_11@l + + mtvsrdd save_permute_1,T2,T1 + + lis T2, save_permute_22@highest + ori T2, T2, save_permute_22@higher + rldicr T2, T2, 32, 31 + oris T2, T2, save_permute_22@h + ori T2, T2, save_permute_22@l + + lis T1, save_permute_21@highest + ori T1, T1, save_permute_21@higher + rldicr T1, T1, 32, 31 + oris T1, T1, save_permute_21@h + ori T1, T1, save_permute_21@l + + mtvsrdd save_permute_2,T2,T1 + +#include "sgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S new file mode 100644 index 000000000..300e30470 --- /dev/null +++ b/kernel/power/sgemm_logic_power9.S @@ -0,0 +1,2133 @@ +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + + MY_ALIGN +LSGEMM_L8x16_LOOP_START: + + LOAD8x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=32 + addi AO,AO,2112 + addi BO,BO,32 + + mtctr L + + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 -2048,0, 0,0 + KERNEL8x16_I1_L4_2 -2048,0, 1,0 + KERNEL8x16_I1_L4_2 -2048,0, 2,0 + KERNEL8x16_I1_L4_2 -2048,0, 3,0 + KERNEL8x16_I1_L4_2 -2048,0, 4,0 + KERNEL8x16_I1_L4_2 -2048,0, 5,0 + KERNEL8x16_I1_L4_2 -2048,0, 6,0 + KERNEL8x16_I1_L4_2 -2048,0, 7,0 + KERNEL8x16_I1_L4_2 -2048,0, 8,0 + KERNEL8x16_I1_L4_2 -2048,0, 9,0 + KERNEL8x16_I1_L4_2 -2048,0, 10,0 + KERNEL8x16_I1_L4_2 -2048,0, 11,0 + KERNEL8x16_I1_L4_2 -2048,0, 12,0 + KERNEL8x16_I1_L4_2 -2048,0, 13,0 + KERNEL8x16_I1_L4_2 -2048,0, 14,0 + KERNEL8x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + + END8x16 0, AO, BO, -2048, 0 + + b LSGEMM_L8x16_SUB1 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L8x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L8x16_SUB2_LOOP: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + bdnz LSGEMM_L8x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_3 64,32, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_0 + KERNEL8x16_I1_L4_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_0 + KERNEL8x16_I1_L2_3 64,32, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L8x16_SUB2 + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. 
L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. 
T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. 
T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. 
L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. 
T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. 
T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S new file mode 100644 index 000000000..c61f419ac --- /dev/null +++ b/kernel/power/sgemm_macros_power9.S @@ -0,0 +1,5828 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + LOAD8x16 1 +.endm + +.macro LOAD8x16_0 + LOAD8x16 0 +.endm + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + 
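+	/* xxlxor of a register with itself is the usual dependency-free idiom for zeroing a VSX accumulator */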
xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + 
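+	/* remaining FMAs of the fourth unrolled step: A tile vs4-vs7 against the replicated B lanes vs8-vs15 */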
xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, 
permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + xvmulsp vs50, vs6,vs12 + xvmulsp vs51, vs7,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + xvmulsp vs54, vs6,vs13 + xvmulsp vs55, vs7,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + xvmulsp vs58, vs6,vs14 + xvmulsp vs59, vs7,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + xvmulsp vs62, vs6,vs15 + xvmulsp vs63, vs7,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 + +.endif + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv 
vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ +#ifndef TRMMKERNEL + + lxv vs32, 0(T4) + lxv vs33, 16(T4) + lxv vs34, 32(T4) + lxv vs35, 48(T4) + lxv vs36, 0(T5) + lxv vs37, 16(T5) + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) + lxv vs42, 32(T6) + lxv vs43, 48(T6) + lxv vs44, 0(T7) + lxv vs45, 16(T7) + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 
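+	/* as for the first 64-byte block: the save_permute_1/save_permute_2 pair
+	   recombines the merged halves, finishing the word transpose of the
+	   vs48..vs63 accumulators before they are scaled and stored to T4..T7 */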
+ xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + stxv vs32, 0(T4) + stxv vs33, 16(T4) + stxv vs34, 32(T4) + stxv vs35, 48(T4) + + stxv vs36, 0(T5) + stxv vs37, 16(T5) + stxv vs38, 32(T5) + stxv vs39, 48(T5) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs40, 0(T6) + stxv vs41, 16(T6) + stxv vs42, 32(T6) + stxv vs43, 48(T6) + stxv vs44, 0(T7) + stxv vs45, 16(T7) + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 
0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, 
DISP32(\Index,64+16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, 
DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, 
vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + 
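+/* N=8, M=4: one 4-float column strip of A meets eight B columns per k step.
+   Rather than splatting each A element, the kernel XOR-rotates the A vector
+   (xxpermdi swaps the 64-bit halves, xxperm with permute_mask swaps the words
+   inside each half) and multiplies it element-wise with whole B vectors, so
+   element i of accumulator r holds sum over k of a[i^rot] * b[i].  A rough C
+   sketch of one k step of END8x4 above (illustrative only; the array names
+   are ours, not part of the kernel):
+
+       int rot[4] = {0, 2, 1, 3};                    // vs0..vs3 rotations of A
+       for (int r = 0; r < 4; r++)
+           for (int i = 0; i < 4; i++) {
+               acc_lo[r][i] += a[i ^ rot[r]] * b[i];      // vs32..vs35, cols 0..3
+               acc_hi[r][i] += a[i ^ rot[r]] * b[4 + i];  // vs48..vs51, cols 4..7
+           }
+
+   SAVE8x4 then undoes the rotation with the xxmrglw/xxmrghw/xxmrg{h,l}d
+   merges before applying alpha. */
+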
+.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp 
vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + 
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + 
xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** 
+* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, 
DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv 
vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro 
Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 
First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + 
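A note for orientation before the narrower tiles below: these 4x8 macros implement the standard GEMM micro-tile update for an M=8, N=4 block of C. The C sketch that follows is an illustration only; it is not part of the patch, and the packed-buffer layout it assumes (8 floats of A and 4 floats of B per k step) is the usual OpenBLAS packing convention. Zero4X8 corresponds to clearing the accumulators, the KERNEL4x8 variants to the k-loop of xvmaddasp fused multiply-adds, and SAVE4x8 to the alpha-scaled write-back (the TRMMKERNEL path overwrites C with xvmulsp instead of accumulating). In the real code the xxperm/xxpermdi shuffles rotate the lanes of B rather than splatting each element, which is why SAVE4x8 needs save_permute_1/save_permute_2 to restore the natural order before storing.

/* reference semantics of the 4x8 tile (hedged sketch, not the kernel itself) */
void ref_kernel_4x8(long k, float alpha, const float *A,
                    const float *B, float *C, long ldc)
{
    float acc[4][8] = {{0.0f}};              /* Zero4X8 */

    for (long l = 0; l < k; l++)             /* KERNEL4x8_* k-loop */
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 8; i++)      /* xvmaddasp into vs32..vs45 */
                acc[j][i] += A[8 * l + i] * B[4 * l + j];

    for (int j = 0; j < 4; j++)              /* SAVE4x8, GEMM path: C += alpha*acc */
        for (int i = 0; i < 8; i++)
            C[j * ldc + i] += alpha * acc[j][i];
}

The same pattern repeats for every tile size that follows (4x4, 4x2, ..., 1x1); only the block shape, the unroll depth of the k-loop, and the lane-shuffle bookkeeping change.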
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, 
\AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp 
vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert 
alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, 
vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, 
DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, 
DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp 
vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 
2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 
16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + 
xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r +#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + 
xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi 
\AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub 
\TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 938a82a9e..d59cb1656 100644 --- a/param.h +++ b/param.h @@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 1280 +#define SGEMM_DEFAULT_P 640 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 320 -#define SGEMM_DEFAULT_Q 640 +#define SGEMM_DEFAULT_Q 1408 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 640 From bfeb9c16b0011f4f5f508a6d6df18017ab28f95a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:24:53 +0200 Subject: [PATCH 002/127] Increment version to 0.3.7.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 969696179..8900973a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 4f8143b098418487b261653b48b16dc71cc2a259 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Apr 2019 19:25:32 +0200 Subject: [PATCH 003/127] Increment version to 0.3.7.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 21782a2b9..b46479d03 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.6 +VERSION = 0.3.7.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From daf2fec12db90c02aa74cb13726efd8f9b708312 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Mon, 29 Apr 2019 17:03:56 -0400 Subject: [PATCH 004/127] Misc. 
typo fixes Found via `codespell -q 3 -w -L ith,als,dum,nd,amin,nto,wis,ba -S ./relapack,./kernel,./lapack-netlib` --- Changelog.txt | 14 +++++++------- Makefile.rule | 6 +++--- README.md | 2 +- cmake/kernel.cmake | 2 +- cmake/system.cmake | 2 +- cmake/utils.cmake | 2 +- common_stackalloc.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest/c_cblat1.f | 2 +- ctest/c_dblat1.f | 2 +- ctest/c_sblat1.f | 2 +- ctest/c_zblat1.f | 2 +- driver/others/blas_server.c | 6 +++--- driver/others/blas_server_win32.c | 4 ++-- driver/others/init.c | 2 +- driver/others/memory.c | 2 +- f_check | 2 +- interface/CMakeLists.txt | 2 +- interface/axpy.c | 2 +- interface/zaxpy.c | 2 +- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- test/cblat1.f | 2 +- test/dblat1.f | 2 +- test/sblat1.f | 2 +- test/zblat1.f | 2 +- 37 files changed, 48 insertions(+), 48 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..9feacf071 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added + * optimized microkernels for single precision BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputing the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputting the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatiable issues with Clang and Pathscale compilers. + * Fixed compatible issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small imput size & multithreads. + * Work-around #27 the low performance axpy issue with small input size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause zdotu & zdotc failures. Instead, work-around it. 
(Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) diff --git a/Makefile.rule b/Makefile.rule index b46479d03..17815096e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -181,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this mumber by THREAD_TIMEOUT +# system). Also you can control this number by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contigous memory +# Using special device driver for mapping physically contiguous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute +# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- diff --git a/README.md b/README.md index 26055c745..76a65b74b 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0ed09e776..9b238f004 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) diff --git a/cmake/system.cmake b/cmake/system.cmake index 7fda2adb9..d0f560872 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -283,7 +283,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: nead to convert these Makefiles +# TODO: need to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 28ef65f47..fd93f8a70 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. 
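The cmake/utils.cmake comment corrected just above describes compiling one generic source file once per BLAS type, passing the exported function name (and the s/d/c/z float character) as a preprocessor definition. A minimal sketch of that pattern, assuming hypothetical FLOAT_T and FUNC_NAME spellings rather than the actual OpenBLAS build glue:

/* scal_generic.c: built once per precision, e.g.
 *   cc -DFLOAT_T=float  -DFUNC_NAME=sscal_k -c scal_generic.c -o sscal_k.o
 *   cc -DFLOAT_T=double -DFUNC_NAME=dscal_k -c scal_generic.c -o dscal_k.o
 * FLOAT_T and FUNC_NAME are illustrative names, not the build system's own. */
#include <stddef.h>

#ifndef FLOAT_T
#define FLOAT_T float            /* assumed default when built standalone */
#endif
#ifndef FUNC_NAME
#define FUNC_NAME sscal_k
#endif

void FUNC_NAME(size_t n, FLOAT_T alpha, FLOAT_T *x)
{
    for (size_t i = 0; i < n; i++)
        x[i] *= alpha;           /* one generic body, several exported symbols */
}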
diff --git a/common_stackalloc.h b/common_stackalloc.h index ec0fa1611..d3d54669c 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Chosing a too small SIZE will lead to a stack smashing. + * Choosing a SIZE too small will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x86.h b/common_x86.h index 3fdffe2a8..99adc9f5b 100644 --- a/common_x86.h +++ b/common_x86.h @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 718a81050..f59ff6627 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimazation for barcelona. +//Enable some optimization for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index c741ce506..1a123d74d 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index c570a9140..4a71b4dcf 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 773787d6f..89902f12d 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 03753e782..cd0c8541d 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
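The figure quoted in the Makefile.rule THREAD_TIMEOUT comment above can be sanity-checked with shell arithmetic, using the comment's own example of n=26 at 3.0 GHz:

    # (1 << 26) cycles divided by 3000 cycles per microsecond
    echo $(( (1 << 26) / 3000 ))    # prints 22369 microseconds, in the ballpark of the quoted "about 25ms"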
* diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e5db1804f..6f4e20610 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this grobal for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every seconds. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every second. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 0b38ee365..bace54a23 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common infomation */ +/* Thread server common information */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for cheking if initialization is finished. */ +/* We need this global for checking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..0aad9c407 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..3fe31168d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); } #endif diff --git a/f_check b/f_check index 34caa00be..b05db85bd 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index f76d5c13f..5ea39f864 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c diff --git a/interface/axpy.c b/interface/axpy.c index 9032946d2..eaa19f4df 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/zaxpy.c b/interface/zaxpy.c index dbd559628..da3b48ead 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small imput size & + //Temporarily work-around the low performance issue with small input size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ff3c5268d..ada701d70 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index 340234270..ffc4766d2 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index f9d3b445a..9cd1d17ad 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index da340774e..621489085 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index e8f6eb412..492f9fd46 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 0619d3eca..79b2eb806 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index 353e63ee8..f21e5aa8b 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index 1e93b843a..d97a695f5 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 249aff275..7614dcd32 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index 8df5609ad..c8487cf7c 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 7e52ef74e..5dc03bac9 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 9e4f85380..5f52622e2 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* tranformed vector x. +* transformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/test/cblat1.f b/test/cblat1.f index a4c996fda..d6b53d105 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index f3255fef4..28af121cd 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index a5c1c6af6..fe05bbe87 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index e2415e1c4..8b4b8d21e 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. 
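Since this series is in git format-patch (mbox) form, a patch such as the one above can be applied and spot-checked locally along these lines (the file name is illustrative, not part of the series):

    git am 0004-typo-fixes.patch    # apply one mailbox patch, preserving author and date
    git log -1 --stat               # compare the recorded diffstat with the one above
    git am --abort                  # abandon the session if the patch fails to apply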
* From b43c8382c885551b0f230c8493e79bf04d94e366 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 May 2019 10:46:46 +0200 Subject: [PATCH 005/127] Correct argument of CPU_ISSET for glibc <2.5 fixes #2104 --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ac8545f35..db14cde02 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -229,7 +229,7 @@ int get_num_procs(void) { n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i Date: Wed, 1 May 2019 19:36:22 +0000 Subject: [PATCH 006/127] conflict resolve --- kernel/power/KERNEL.POWER9 | 10 +++++----- kernel/power/icamax.c | 2 +- kernel/power/icamin.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 6d5cf9068..0e0d62393 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -12,11 +12,11 @@ SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power9.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index 06fc5d8ad..bd74d20e5 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; #if defined(USE_MASK_PERMUTATIONS) register __vector unsigned int static_index0 = {0,1,2,3}; #else diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 36432c993..336766245 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { BLASLONG index; - BLASLONG i; + BLASLONG i=0; register __vector unsigned int static_index0 = {0,1,2,3}; register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} From 858e609e1feba715065a65034eef02c9516aa107 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:01:29 -0400 Subject: [PATCH 007/127] Revert reference/ fixes --- reference/ctbmvf.f | 2 +- reference/ctpmvf.f | 2 +- reference/ctrmvf.f | 2 +- reference/dtbmvf.f | 2 +- reference/dtpmvf.f | 2 +- reference/dtrmvf.f | 2 +- reference/stbmvf.f | 2 +- reference/stpmvf.f | 2 +- reference/strmvf.f | 2 +- reference/ztbmvf.f | 2 +- reference/ztpmvf.f | 2 +- reference/ztrmvf.f | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f index ada701d70..ff3c5268d 100644 --- a/reference/ctbmvf.f +++ b/reference/ctbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f index ffc4766d2..340234270 100644 --- a/reference/ctpmvf.f +++ b/reference/ctpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f index 9cd1d17ad..f9d3b445a 100644 --- a/reference/ctrmvf.f +++ b/reference/ctrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f index 621489085..da340774e 100644 --- a/reference/dtbmvf.f +++ b/reference/dtbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f index 492f9fd46..e8f6eb412 100644 --- a/reference/dtpmvf.f +++ b/reference/dtpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f index 79b2eb806..0619d3eca 100644 --- a/reference/dtrmvf.f +++ b/reference/dtrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stbmvf.f b/reference/stbmvf.f index f21e5aa8b..353e63ee8 100644 --- a/reference/stbmvf.f +++ b/reference/stbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/stpmvf.f b/reference/stpmvf.f index d97a695f5..1e93b843a 100644 --- a/reference/stpmvf.f +++ b/reference/stpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/strmvf.f b/reference/strmvf.f index 7614dcd32..249aff275 100644 --- a/reference/strmvf.f +++ b/reference/strmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. 
* On entry, INCX specifies the increment for the elements of diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f index c8487cf7c..8df5609ad 100644 --- a/reference/ztbmvf.f +++ b/reference/ztbmvf.f @@ -117,7 +117,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f index 5dc03bac9..7e52ef74e 100644 --- a/reference/ztpmvf.f +++ b/reference/ztpmvf.f @@ -77,7 +77,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f index 5f52622e2..9e4f85380 100644 --- a/reference/ztrmvf.f +++ b/reference/ztrmvf.f @@ -80,7 +80,7 @@ * ( 1 + ( n - 1 )*abs( INCX ) ). * Before entry, the incremented array X must contain the n * element vector x. On exit, X is overwritten with the -* transformed vector x. +* tranformed vector x. * * INCX - INTEGER. * On entry, INCX specifies the increment for the elements of From b46875b76b8d4ebbc320547c20f7f4486fe52563 Mon Sep 17 00:00:00 2001 From: "luz.paz" Date: Sat, 4 May 2019 15:43:17 -0400 Subject: [PATCH 008/127] Revert Changelog.txt typos --- Changelog.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index 9feacf071..8df35d5c3 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -74,7 +74,7 @@ ARMv8: * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 IBM Z: - * optimized microkernels for single precision BLAS1/2 functions have been added + * optimized microkernels for single precicion BLAS1/2 functions have been added for both Z13 and Z14 ==================================================================== @@ -588,8 +588,8 @@ common: s/d/c/zaxpby, s/d/c/zimatcopy, s/d/c/zomatcopy. * Added OPENBLAS_CORETYPE environment for dynamic_arch. (a86d34) * Added NO_AVX2 flag for old binutils. (#401) - * Support outputting the CPU corename on runtime.(#407) - * Patched LAPACK to fix bug 114, 117, 118. + * Support outputing the CPU corename on runtime.(#407) + * Patched LAPACK to fix bug 114, 117, 118. (http://www.netlib.org/lapack/bug_list.html) * Disabled ?gemm3m for a work-around fix. (#400) x86/x86-64: @@ -628,7 +628,7 @@ Version 0.2.9.rc1 13-Jan-2013 common: * Update LAPACK to 3.5.0 version - * Fixed compatible issues with Clang and Pathscale compilers. + * Fixed compatiable issues with Clang and Pathscale compilers. x86/x86-64: * Optimization on Intel Haswell. @@ -705,7 +705,7 @@ Version 0.2.5 26-Nov-2012 common: * Added NO_SHARED flag to disable generating the shared library. - * Compile LAPACKE with ILP64 model when INTERFACE64=1 (#158) + * Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158) * Export LAPACK 3.4.2 symbols in shared library. (#147) * Only detect the number of physical CPU cores on Mac OSX. (#157) * Fixed NetBSD build. (#155) @@ -896,7 +896,7 @@ x86/x86_64: * Fixed #28 a wrong result of dsdot on x86_64. * Fixed #32 a SEGFAULT bug of zdotc with gcc-4.6. * Fixed #33 ztrmm bug on Nehalem. - * Work-around #27 the low performance axpy issue with small input size & multithreads. 
+ * Work-around #27 the low performance axpy issue with small imput size & multithreads. MIPS64: * Fixed #28 a wrong result of dsdot on Loongson3A/MIPS64. @@ -919,7 +919,7 @@ common: * Imported GotoBLAS2 1.13 BSD version x86/x86_64: - * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would cause + * On x86 32bits, fixed a bug in zdot_sse2.S line 191. This would casue zdotu & zdotc failures. Instead, work-around it. (Refs issue #8 #9 on github) * Modified ?axpy functions to return same netlib BLAS results when incx==0 or incy==0 (Refs issue #7 on github) From 7ed8431527eb00f161de4dd309fd4d2b6c885b0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 May 2019 22:54:41 +0200 Subject: [PATCH 009/127] Disable the SkyLakeX DGEMMITCOPY kernel as well as a stopgap measure for https://github.com/numpy/numpy/issues/13401 as mentioned in #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 5d0a300b5..3c678904d 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -10,7 +10,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From b1561ecc6864428baa4f1336d47d23729b9636f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 5 May 2019 15:52:01 +0200 Subject: [PATCH 010/127] Disable DGEMMINCOPY as well for now #1955 --- kernel/x86_64/KERNEL.SKYLAKEX | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 3c678904d..d61c51628 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -9,7 +9,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c From 5a9cce2bf6740110b93a534f876072f220d928d1 Mon Sep 17 00:00:00 2001 From: Fabrice Fontaine Date: Sun, 5 May 2019 18:37:28 +0200 Subject: [PATCH 011/127] Makefile.arm: remove -march flags The provided -march flags, especially for ARMv5 and ARMv6 may not necessarily match the needed ones: for ARMv5, it might be armv5, armv5te, armv5t, etc. If the wrong one is used, the incorrect toolchain sysroot can be used in a multilib toolchain. Therefore, let the user building OpenBLAS pass the appropriate -march flag. The other flags, such as -mfpu=vfp or -mfloat-abi=hard are kept, as they are actually required for the build to proceed (OpenBLAS uses VFP instructions, and assume an EABIhf ABI). 
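With this change, choosing the architecture level becomes the builder's job. A minimal sketch of what that might look like, assuming the COMMON_OPT hook documented in Makefile.rule (the armv5te value is purely an example; pick whatever matches the toolchain's sysroot):

    # COMMON_OPT is folded into both CFLAGS and FFLAGS by the build system
    make TARGET=ARMV5 COMMON_OPT="-O2 -march=armv5te"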
[Peter: update for v0.2.20] Signed-off-by: Thomas Petazzoni Signed-off-by: Peter Korsgaard [Retrieved from: https://git.buildroot.net/buildroot/tree/package/openblas/0001-Makefile.arm-remove-march-flags.patch] Signed-off-by: Fabrice Fontaine --- Makefile.arm | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index eedd39b73..b5d80f8e6 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,7 +1,7 @@ ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon else CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a @@ -9,11 +9,6 @@ endif endif ifeq ($(CORE), ARMV6) -CCOMMON_OPT += -mfpu=vfp -march=armv6 -FCOMMON_OPT += -mfpu=vfp -march=armv6 -endif - -ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -march=armv5 -FCOMMON_OPT += -march=armv5 +CCOMMON_OPT += -mfpu=vfp +FCOMMON_OPT += -mfpu=vfp endif From 3d7debbb280d0e671691469da650a124bedda219 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 7 May 2019 13:15:08 +0300 Subject: [PATCH 012/127] init From a6a8cc2b7fa30f46fdaa4fb6e50c19da8c11e335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 May 2019 13:34:52 +0200 Subject: [PATCH 013/127] Fix errors in cpu enumeration with glibc 2.6 for #2114 --- driver/others/init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 012ef6647..a29dce971 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than inital condition */ + /* if number of threads is larger than initial condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { + +#if defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; +#else + cpu_set_t cpuset; +#endif +#endif int nums; int ret; @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); if (ret!=0) { common->num_procs = nums; } else { @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); + common->num_procs = CPU_COUNT(&cpuset); } #endif From c516209581a77790b8d67d6dcd0c3f95fe713643 Mon Sep 17 00:00:00 2001 From: Diazonium Date: Tue, 7 May 2019 14:55:20 +0200 Subject: [PATCH 014/127] Change two http links to https Closes #2109 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 76a65b74b..620e393f1 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . 
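The __GLIBC_PREREQ guards added in the affinity fix above are needed because cpu_set_t handling changed across glibc releases. When reproducing such issues, the glibc on a given build host can be identified with either of these standard commands:

    getconf GNU_LIBC_VERSION    # prints e.g. "glibc 2.5"
    ldd --version | head -n1    # banner line names the glibc version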
## Binary Packages @@ -22,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies From 575a84398a1569738029594372f9143a6743c52c Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Tue, 7 May 2019 23:46:54 +0300 Subject: [PATCH 015/127] remove redundant code #2113 --- lapack/getrf/getrf_parallel.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index 591ce4a99..c82defcab 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -279,9 +279,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) #if 1 { - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; @@ -368,9 +365,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; @@ -402,9 +396,6 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 - LOCK_COMMAND(&getrf_lock); - jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; - UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; From 7d1b468d9d83789d25eb6996afb5e358ee861f1d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 8 May 2019 09:58:01 +0800 Subject: [PATCH 016/127] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 000000000..aa912913d --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,19 @@ +# Starter pipeline +# Start with a minimal pipeline that you can customize to build and deploy your code. +# Add steps that build, run tests, deploy, and more: +# https://aka.ms/yaml + +trigger: +- master + +pool: + vmImage: 'ubuntu-latest' + +steps: +- script: echo Hello, world! + displayName: 'Run a one-line script' + +- script: | + echo Add other tasks to build, test, and deploy your project. 
+ echo See https://aka.ms/yaml + displayName: 'Run a multi-line script' From e47b63466b26dab9618443fd5754885bea653845 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 7 May 2019 16:06:42 -0700 Subject: [PATCH 017/127] TST: add native POWER8 to CI * add native POWER8 testing to Travis CI matrix with ppc64le os entry --- .travis.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.travis.yml b/.travis.yml index eee7674fe..00a2509f9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,6 +25,15 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 70cea0b96b70330ae6ef80b954e708d6acd86911 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 12:20:00 +0200 Subject: [PATCH 018/127] Update link to IBM MASS library, update cpu support status --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620e393f1..68a121498 100644 --- a/README.md +++ b/README.md @@ -63,9 +63,7 @@ A debug version can be built using `make DEBUG=1`. ### Compile with MASS support on Power CPU (optional) -The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library -consists of a set of mathematical functions for C, C++, and Fortran applications that are -are tuned for optimum performance on POWER architectures. +The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. +- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` +- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` +- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) +- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS From 3a49e8c05aa24bba832e5e05bd8888fbee039919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:52:22 +0200 Subject: [PATCH 019/127] first try migrating one of the arm builds from travis --- azure-pipelines.yml | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index aa912913d..87b4de3f0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,26 @@ steps: displayName: 'Run a one-line script' - script: | - echo Add other tasks to build, test, and deploy your project. 
- echo See https://aka.ms/yaml - displayName: 'Run a multi-line script' + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ + condition: not(startsWith(variables['CONFIG'], 'linux_64')) + displayName: Configure binfmt_misc + +- script: | + echo "FROM openblas/alpine:arm32 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + +#- script: | +# echo Add other tasks to build, test, and deploy your project. +# echo See https://aka.ms/yaml +# displayName: 'Run a multi-line script' From 5cf434167ab9622c6788e4fdc9b418ab7bf96e61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 13:58:59 +0200 Subject: [PATCH 020/127] fix tabbing in azure commands --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 87b4de3f0..3b277073a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,10 +14,10 @@ steps: displayName: 'Run a one-line script' - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ + docker run --rm --privileged multiarch/qemu-user-static:register --reset + ls /proc/sys/fs/binfmt_misc/ condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: Configure binfmt_misc + displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - + displayname: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. # echo See https://aka.ms/yaml From aa4c41bad26bbb6d550ddad3141063c2260b7afd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:12:02 +0200 Subject: [PATCH 021/127] Update azure-pipelines.yml take out offending lines (although stolen from https://github.com/conda-forge/opencv-feedstock azure-pipelines fiie) --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3b277073a..d7e6cdc9b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,9 +15,9 @@ steps: - script: | docker run --rm --privileged multiarch/qemu-user-static:register --reset - ls /proc/sys/fs/binfmt_misc/ - condition: not(startsWith(variables['CONFIG'], 'linux_64')) - displayName: 'Configure binfmt_misc' +# ls /proc/sys/fs/binfmt_misc/ +# condition: not(startsWith(variables['CONFIG'], 'linux_64')) +# displayName: 'Configure binfmt_misc' - script: | echo "FROM openblas/alpine:arm32 From 16fd8e3dbe510802860f1981321bf9cd70676de4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 14:14:22 +0200 Subject: [PATCH 022/127] Update azure-pipelines.yml --- azure-pipelines.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d7e6cdc9b..12ea40b61 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayname: 'Run ARMV6 docker build' + displayName: 'Run ARMV6 docker build' + #- script: | # echo Add other tasks to build, test, and deploy your project. 
# echo See https://aka.ms/yaml From a598ab1d32c1d5fcf9b9eb0c503a24db13757bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:23:54 +0200 Subject: [PATCH 023/127] Update azure-pipelines.yml --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 12ea40b61..2b092c256 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - displayName: 'Run ARMV6 docker build' +# displayName: 'Run ARMV6 docker build' #- script: | # echo Add other tasks to build, test, and deploy your project. From dd77a3f0e27dee0c15b6e1da3649aba6723631ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 15:25:43 +0200 Subject: [PATCH 024/127] Update azure-pipelines.yml --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2b092c256..e25f11cb1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,6 +32,8 @@ steps: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . + + # displayName: 'Run ARMV6 docker build' #- script: | From ad20ceaa680e555e6f4e5e6d199f4c158ef1b6df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 8 May 2019 19:07:58 +0200 Subject: [PATCH 025/127] Update azure-pipelines.yml --- azure-pipelines.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e25f11cb1..0b1ba16fd 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,14 +13,14 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset +#- script: | +# docker run --rm --privileged multiarch/qemu-user-static:register --reset # ls /proc/sys/fs/binfmt_misc/ # condition: not(startsWith(variables['CONFIG'], 'linux_64')) # displayName: 'Configure binfmt_misc' - script: | - echo "FROM openblas/alpine:arm32 + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -31,10 +31,8 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - - -# displayName: 'Run ARMV6 docker build' + docker build . + displayName: Run ARMV6 docker build #- script: | # echo Add other tasks to build, test, and deploy your project. From 53703585aa5ac170cabfe035a32bd0e07e1877c8 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 15:14:01 -0700 Subject: [PATCH 026/127] DOC: Add Azure CI status badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 68a121498..14815ff00 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) +[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) + ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. 
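A note on the qemu-user-static step the ARM jobs above depend on: the privileged container registers binfmt_misc handlers on the x86_64 host, so foreign-architecture ELF binaries run transparently under qemu inside the build containers. Registering and then verifying by hand looks like this:

    docker run --rm --privileged multiarch/qemu-user-static:register --reset
    ls /proc/sys/fs/binfmt_misc/            # expect qemu-arm, qemu-aarch64, ... entries
    cat /proc/sys/fs/binfmt_misc/qemu-arm   # shows magic and interpreter for 32-bit ARM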
From 406c7242f49730e45453544b601d717e02ebe07d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 00:47:44 +0200 Subject: [PATCH 027/127] Add ARMV6 build to azure CI setup (#2122) using aytekinar's Alpine image and docker script from the Travis setup [skip ci] --- azure-pipelines.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0b1ba16fd..cef2ef973 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,19 +13,15 @@ steps: - script: echo Hello, world! displayName: 'Run a one-line script' -#- script: | -# docker run --rm --privileged multiarch/qemu-user-static:register --reset -# ls /proc/sys/fs/binfmt_misc/ -# condition: not(startsWith(variables['CONFIG'], 'linux_64')) -# displayName: 'Configure binfmt_misc' - - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV6 \ + -D NOFORTRAN=ON \ -D BUILD_SHARED_LIBS=ON \ -D BUILD_WITHOUT_LAPACK=ON \ -D BUILD_WITHOUT_CBLAS=ON \ From 4efbac28ed42b79ac0ba27cfe065d38a3ba5af68 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Wed, 8 May 2019 18:51:59 -0700 Subject: [PATCH 028/127] TST: Azure manylinux1 & clean-up * remove some of the steps & comments from the original Azure yml template * modify the trigger section to use develop since OpenBLAS primarily uses this branch; use the same batching behavior as downstream projects NumPy/ SciPy * remove Travis emulated ARMv6 gcc build because this now happens in Azure * use documented Ubuntu vmImage name for Azure and add in a manylinux1 test run to the matrix [skip appveyor] --- .travis.yml | 8 ++----- azure-pipelines.yml | 57 +++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 00a2509f9..82e2aaac8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -177,8 +177,8 @@ matrix: dist: trusty sudo: required services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - name: "Emulated Build for ARMV6 with gcc" + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset script: | echo "FROM openblas/alpine:${IMAGE_ARCH} @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc name: "Emulated Build for ARMV8 with gcc" @@ -204,7 +201,6 @@ matrix: name: "Emulated Build for ARMV8 with clang" allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cef2ef973..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,21 +1,18 @@ -# Starter pipeline -# Start with a minimal pipeline that you can customize to build and deploy your code. 
-# Add steps that build, run tests, deploy, and more: -# https://aka.ms/yaml - trigger: -- master + # start a new build for every push + batch: False + branches: + include: + - develop -pool: - vmImage: 'ubuntu-latest' - -steps: -- script: echo Hello, world! - displayName: 'Run a one-line script' - -- script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 +jobs: +- job: ARMv6_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm32 COPY . /tmp/openblas RUN mkdir /tmp/openblas/build && \ cd /tmp/openblas/build && \ @@ -27,10 +24,24 @@ steps: -D BUILD_WITHOUT_CBLAS=ON \ -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile - docker build . - displayName: Run ARMV6 docker build - -#- script: | -# echo Add other tasks to build, test, and deploy your project. -# echo See https://aka.ms/yaml -# displayName: 'Run a multi-line script' + docker build . + displayName: Run ARMV6 docker build +# manylinux1 is useful to test because the +# standard Docker container uses an old version +# of gcc / glibc +- job: manylinux1_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + echo "FROM quay.io/pypa/manylinux1_x86_64 + COPY . /tmp/openblas + RUN cd /tmp/openblas && \ + COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ + BTYPE='BINARY=64' CC=gcc && \ + make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ + make -C test $COMMON_FLAGS $BTYPE && \ + make -C ctest $COMMON_FLAGS $BTYPE && \ + make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile + docker build . + displayName: Run manylinux1 docker build From a3d4c65d62cf3689fe5840e65a7fcdb64d986435 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 11:52:02 +0200 Subject: [PATCH 029/127] Add NO_AFFINITY to available options on Linux, and set it to ON to match the gmake default. 
Fixes second part of #2114 --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a27c1c0fc..50da721cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 6.dev) +set(OpenBLAS_PATCH_VERSION 7.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,9 +20,14 @@ if(MSVC) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) -option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) +else() +set(NO_AFFINITY 1) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using From 9ea30f3788b64b7f42acfaf08e234591aee33e23 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 May 2019 14:42:36 +0200 Subject: [PATCH 030/127] Replace ISMIN and ISAMIN kernels on all x86_64 platforms (#2125) * Mark iamax_sse.S as unsuitable for MIN due to issue #2116 * Use iamax.S rather than iamax_sse.S for ISMIN/ISAMIN on all x86_64 as workaround for #2116 --- kernel/x86_64/KERNEL | 4 +- kernel/x86_64/iamax_sse.S | 106 ++++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 52 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bb..92d121ab2 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax_sse.S +ISAMINKERNEL = iamax.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax_sse.S +ISMINKERNEL = iamax.S endif ifndef IDMINKERNEL diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index f22e34a1d..d50c1699c 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,6 +36,10 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ +/* This kernel was found to give wrong results when used for ISMIN/ISAMIN + with increment != 1, although it appears to be correct for corresponding + MAX operations. 
See issue 2116 */ + #define ASSEMBLER #include "common.h" @@ -48,9 +52,11 @@ #define XX %r10 #define MM %r11 +#define MAXPS maxps +#define MAXSS maxss #ifdef USE_MIN -#define maxps minps -#define maxss minss +#define MAXPS minps +#define MAXSS minss #endif #include "l1param.h" @@ -103,7 +109,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 @@ -117,7 +123,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm1 + MAXPS %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 @@ -137,25 +143,25 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -173,13 +179,13 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -191,7 +197,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -204,7 +210,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L18: @@ -215,22 +221,22 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X @@ -427,28 +433,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -467,14 +473,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxps %xmm4, %xmm0 + MAXPS %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxps %xmm5, %xmm1 + MAXPS %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -488,7 +494,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxps %xmm6, %xmm2 + MAXPS %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -501,7 +507,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxps %xmm7, %xmm3 + MAXPS %xmm7, %xmm3 addq $2 * SIZE, X .L38: @@ -512,7 +518,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 jmp .L40 ALIGN_4 @@ -520,15 +526,15 @@ movq XX, X movq MM, M - maxps %xmm1, %xmm0 - maxps %xmm3, %xmm2 - maxps %xmm2, %xmm0 + MAXPS %xmm1, %xmm0 + MAXPS %xmm3, %xmm2 + MAXPS %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - maxps %xmm1, %xmm0 + MAXPS %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - maxss %xmm1, %xmm0 + MAXSS %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I @@ 
-687,56 +693,56 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 decq I jg .L81 @@ -754,28 +760,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - maxss %xmm7, %xmm3 + MAXSS %xmm7, %xmm3 ALIGN_3 .L86: @@ -787,14 +793,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - maxss %xmm4, %xmm0 + MAXSS %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - maxss %xmm5, %xmm1 + MAXSS %xmm5, %xmm1 ALIGN_3 .L87: @@ -806,16 +812,16 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - maxss %xmm6, %xmm2 + MAXSS %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M - maxss %xmm1, %xmm0 - maxss %xmm3, %xmm2 - maxss %xmm2, %xmm0 + MAXSS %xmm1, %xmm0 + MAXSS %xmm3, %xmm2 + MAXSS %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I From 3cb1c8d210046f4f6e2935fe796af3648387a38e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:07:30 +0200 Subject: [PATCH 031/127] Move ARMv8 gcc build from Travis to Azure --- azure-pipelines.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..4673d07fe 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,6 +26,26 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build +- job: ARMv8_gcc + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + echo "FROM openblas/alpine:arm64 + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=gcc cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=ARMV8 \ + -D NOFORTRAN=ON \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . 
+ displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From 999a04f101250a189c92919277db6cbc50a584ff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 16:08:23 +0200 Subject: [PATCH 032/127] Move ARMv8 gcc build from Travis to Azure --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 82e2aaac8..eb74ded37 100644 --- a/.travis.yml +++ b/.travis.yml @@ -202,7 +202,6 @@ matrix: allow_failures: - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist From 43068288e9fc035cd9ebc7254de7a5f0a3600090 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 May 2019 22:37:06 +0200 Subject: [PATCH 033/127] Update .travis.yml --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index eb74ded37..b2827997c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,9 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc - name: "Emulated Build for ARMV8 with gcc" - <<: *emulated-arm env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang name: "Emulated Build for ARMV8 with clang" From d86f0b9e74130ab659062bca40badc1dc36649f0 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:35:07 -0500 Subject: [PATCH 034/127] Test drone CI --- .drone.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .drone.yml diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..b2300b81d --- /dev/null +++ b/.drone.yml @@ -0,0 +1,19 @@ +--- +kind: pipeline +name: arm64_gcc + +platform: + os: linux + arch: arm64 + +steps: +- name: Build + image: centos:7 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" From 58829c098841d2da28defa96538a7a2f9d3e0f21 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:39:51 -0500 Subject: [PATCH 035/127] install make --- .drone.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index b2300b81d..75868e919 100644 --- a/.drone.yml +++ b/.drone.yml @@ -7,12 +7,13 @@ platform: arch: arm64 steps: -- name: Build +- name: Build and Test image: centos:7 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - sudo yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From ff807473bb6e0faf8e7767c18b5cfae1318e0aaa Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:40:23 -0500 Subject: [PATCH 036/127] remove sudo --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index 75868e919..da9520975 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - sudo yum -y install make + - yum -y install make - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 21acf03e9a2b21e39fa6e81899f100084de0ba93 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:42:16 -0500 
Subject: [PATCH 037/127] Install gcc --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index da9520975..c4f216ed6 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make + - yum -y install make gcc - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 15f925fe9a0ca823352fd252cad2da95c810cec4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:44:15 -0500 Subject: [PATCH 038/127] Install perl --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index c4f216ed6..765c2b02c 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc + - yum -y install make gcc perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From a0aaf308ed682d58962f1dd6f568647e97572596 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:47:49 -0500 Subject: [PATCH 039/127] Install gfortran and add a clang job --- .drone.yml | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 765c2b02c..3b1515c33 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,6 +1,6 @@ --- kind: pipeline -name: arm64_gcc +name: arm64_gcc_make platform: os: linux @@ -13,7 +13,28 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc perl + - yum -y install make gcc gfortran perl + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS" + +--- +kind: pipeline +name: arm64_clang_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: centos:7 + environment: + CC: clang + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + commands: + - yum -y install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 9184590c33e9b8df68460877a0d56e229d21d2ce Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:50:37 -0500 Subject: [PATCH 040/127] gfortran->gcc-gfortran --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3b1515c33..37ca7478f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,7 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl + - yum -y install make gcc gcc-gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +34,7 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - - yum -y install make gcc gfortran perl clang + - yum -y install make gcc gcc-gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dc110e179d5110bb807ee9c962e9b7da938ac9a6 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:53:58 -0500 Subject: [PATCH 041/127] Switch to ubuntu and parallel jobs --- .drone.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 37ca7478f..f048cad1f 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,12 +8,12 
@@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -29,12 +29,12 @@ platform: steps: - name: Build and Test - image: centos:7 + image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - yum -y install make gcc gcc-gfortran perl clang + - apt install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 612c2d78e0589634de791c72769c978c2fdc0141 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:04 -0500 Subject: [PATCH 042/127] apt update --- .drone.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index f048cad1f..973e00c00 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,7 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -34,7 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt install make gcc gfortran perl clang + - apt-get update + - apt-get install make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From 231472c4c6c5e4b76000e62b1ad8b0a0b25c6ed4 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:55:38 -0500 Subject: [PATCH 043/127] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index 973e00c00..5fe9983ae 100644 --- a/.drone.yml +++ b/.drone.yml @@ -18,7 +18,7 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS --- kind: pipeline @@ -40,4 +40,4 @@ steps: - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS" + - make -C utest $COMMON_FLAGS From 608cd69b66059de14b29639ab29957c99190be5c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 13:56:59 -0500 Subject: [PATCH 044/127] update yes --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 5fe9983ae..6413bd1c9 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,8 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +35,8 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' commands: - - apt-get update - - apt-get install make gcc gfortran perl clang + - apt-get update -y + - apt-get install -y make gcc gfortran perl clang - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test 
$COMMON_FLAGS - make -C ctest $COMMON_FLAGS From d40c109eb0ace38d967e221308496854d207a70f Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:02:39 -0500 Subject: [PATCH 045/127] no need of gcc in clang build --- .drone.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index 6413bd1c9..0d7fd2000 100644 --- a/.drone.yml +++ b/.drone.yml @@ -11,10 +11,10 @@ steps: image: ubuntu:18.04 environment: CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -33,10 +33,10 @@ steps: image: ubuntu:18.04 environment: CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32 -j' + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: - apt-get update -y - - apt-get install -y make gcc gfortran perl clang + - apt-get install -y make $CC gfortran perl - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS From dadafcdcd84ffa8f5545a14aa3b2c0b39398195c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:06:04 -0500 Subject: [PATCH 046/127] Add a cmake build as well --- .drone.yml | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/.drone.yml b/.drone.yml index 0d7fd2000..70bfc5f19 100644 --- a/.drone.yml +++ b/.drone.yml @@ -41,3 +41,47 @@ steps: - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS + +--- +kind: pipeline +name: arm64_gcc_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . + - ctest + +--- +kind: pipeline +name: arm64_clang_cmake + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + commands: + - apt-get update -y + - apt-get install -y make $CC gfortran perl cmake + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - cmake --build . 
+ - ctest From cd99dfe034e3df8ad850dbae96e2e1deac6fdc2d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:09:29 -0500 Subject: [PATCH 047/127] Add cmake builds and print options --- .drone.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.drone.yml b/.drone.yml index 70bfc5f19..a7224036e 100644 --- a/.drone.yml +++ b/.drone.yml @@ -13,8 +13,10 @@ steps: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -35,8 +37,10 @@ steps: CC: clang COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl + - $CC --version - make QUIET_MAKE=1 $COMMON_FLAGS - make -C test $COMMON_FLAGS - make -C ctest $COMMON_FLAGS @@ -57,8 +61,10 @@ steps: CC: gcc CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . @@ -79,8 +85,10 @@ steps: CC: clang CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - apt-get install -y make $CC gfortran perl cmake + - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - cmake --build . From 3d94ab660f6352d31ef4a92835fd7506869cb80d Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:17:12 -0500 Subject: [PATCH 048/127] build without lapack on cmake --- .drone.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.drone.yml b/.drone.yml index a7224036e..3df5f406a 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,11 +59,12 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: g++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. @@ -83,11 +84,12 @@ steps: image: ubuntu:18.04 environment: CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32' + CXX: clang++ + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC gfortran perl cmake + - apt-get install -y make $CC $CXX perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. 
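The .drone.yml jobs above reduce to a handful of commands when run by hand. A
rough local equivalent of the arm64_gcc_cmake pipeline as it stands at this
point in the series, for reproducing it outside CI, would be the following
sketch; the host image and flag values simply mirror the pipeline entries and
can be adjusted:

    # on an Ubuntu 18.04 arm64 host (or, for instance, under qemu-user)
    apt-get update -y
    apt-get install -y make gcc g++ perl cmake
    mkdir build && cd build
    cmake -DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 \
          -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON ..
    cmake --build .
    ctest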
From 7aa6faad5f17cbd6e477e0c393a3ae853e610de8 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:22:36 -0500 Subject: [PATCH 049/127] parallel build --- .drone.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.drone.yml b/.drone.yml index 3df5f406a..a8c69f8ca 100644 --- a/.drone.yml +++ b/.drone.yml @@ -59,16 +59,15 @@ steps: image: ubuntu:18.04 environment: CC: gcc - CXX: g++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX g++ perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest --- @@ -84,14 +83,13 @@ steps: image: ubuntu:18.04 environment: CC: clang - CXX: clang++ CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' commands: - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - apt-get update -y - - apt-get install -y make $CC $CXX perl cmake + - apt-get install -y make $CC g++ perl cmake - $CC --version - mkdir build && cd build - cmake $CMAKE_FLAGS .. - - cmake --build . + - make -j - ctest From e3cb8ad2d6cef8a56d8a0543d58c678f7b068ecd Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 14:28:48 -0500 Subject: [PATCH 050/127] See if ubuntu 19.04 fixes the ICE --- .drone.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.drone.yml b/.drone.yml index a8c69f8ca..46f259794 100644 --- a/.drone.yml +++ b/.drone.yml @@ -8,7 +8,7 @@ platform: steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' From 7ff44e0016f1f1bdeb518e108d9ae65e30004233 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:09:53 -0500 Subject: [PATCH 051/127] Remove qemu armv8 builds --- .travis.yml | 7 ------- azure-pipelines.yml | 20 -------------------- 2 files changed, 27 deletions(-) diff --git a/.travis.yml b/.travis.yml index b2827997c..dc388459b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -193,13 +193,6 @@ matrix: -D CMAKE_BUILD_TYPE=Release ../ && \ cmake --build ." > Dockerfile docker build . - - <<: *emulated-arm - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang - name: "Emulated Build for ARMV8 with clang" - - allow_failures: - - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang # whitelist branches: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 4673d07fe..cbea6f4a7 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -26,26 +26,6 @@ jobs: cmake --build ." > Dockerfile docker build . displayName: Run ARMV6 docker build -- job: ARMv8_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm64 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV8 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV8 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b911525c81063db8b7525800cff2a7d842b99518 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:14:46 -0500 Subject: [PATCH 052/127] arm32 build --- .drone.yml | 48 +++++++++++++++++++++++++++++++++++++++++++++ .travis.yml | 21 -------------------- azure-pipelines.yml | 20 ------------------- 3 files changed, 48 insertions(+), 41 deletions(-) diff --git a/.drone.yml b/.drone.yml index 46f259794..aa9e129e0 100644 --- a/.drone.yml +++ b/.drone.yml @@ -22,6 +22,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_gcc_make + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + --- kind: pipeline name: arm64_clang_make @@ -46,6 +70,30 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS +--- +kind: pipeline +name: arm32_clang_cmake + +platform: + os: linux + arch: arm + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: clang + CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' + commands: + - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" + - apt-get update -y + - apt-get install -y make $CC g++ perl cmake + - $CC --version + - mkdir build && cd build + - cmake $CMAKE_FLAGS .. + - make -j + - ctest + --- kind: pipeline name: arm64_gcc_cmake diff --git a/.travis.yml b/.travis.yml index dc388459b..a92bb0687 100644 --- a/.travis.yml +++ b/.travis.yml @@ -173,27 +173,6 @@ matrix: env: - BTYPE="BINARY=32" - - &emulated-arm - dist: trusty - sudo: required - services: docker - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang - name: "Emulated Build for ARMV6 with clang" - before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset - script: | - echo "FROM openblas/alpine:${IMAGE_ARCH} - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=${TARGET_ARCH} \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . - # whitelist branches: only: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cbea6f4a7..7197062d1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -6,26 +6,6 @@ trigger: - develop jobs: -- job: ARMv6_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - docker run --rm --privileged multiarch/qemu-user-static:register --reset - echo "FROM openblas/alpine:arm32 - COPY . /tmp/openblas - RUN mkdir /tmp/openblas/build && \ - cd /tmp/openblas/build && \ - CC=gcc cmake -D DYNAMIC_ARCH=OFF \ - -D TARGET=ARMV6 \ - -D NOFORTRAN=ON \ - -D BUILD_SHARED_LIBS=ON \ - -D BUILD_WITHOUT_LAPACK=ON \ - -D BUILD_WITHOUT_CBLAS=ON \ - -D CMAKE_BUILD_TYPE=Release ../ && \ - cmake --build ." > Dockerfile - docker build . 
- displayName: Run ARMV6 docker build # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc From b43deb4ad60b2960b4c0ee1aca6afeaadc30673c Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Sun, 12 May 2019 15:25:45 -0500 Subject: [PATCH 053/127] Fix typo --- .drone.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.drone.yml b/.drone.yml index aa9e129e0..779912954 100644 --- a/.drone.yml +++ b/.drone.yml @@ -28,11 +28,11 @@ name: arm32_gcc_make platform: os: linux - arch: arm64 + arch: arm steps: - name: Build and Test - image: ubuntu:18.04 + image: ubuntu:19.04 environment: CC: gcc COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' From a211bc9b6a6e597a38fc8b8b7ed0b006cb367c46 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Tue, 14 May 2019 11:32:23 -0700 Subject: [PATCH 054/127] TST: add SkylakeX AVX512 CI test * adapt the C-level reproducer code for some recent SkylakeX AVX512 kernel issues, provided by Isuru Fernando and modified by Martin Kroeker, for usage in the utest suite * add an Intel SDE SkylakeX emulation utest run to the Azure CI matrix; a custom Docker build was required because Ubuntu image provided by Azure does not support AVX512VL instructions --- azure-pipelines.yml | 24 ++++++++++++++++++ utest/CMakeLists.txt | 1 + utest/Makefile | 1 + utest/test_kernel_regress.c | 50 +++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 utest/test_kernel_regress.c diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 7197062d1..9b4c85367 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -25,3 +25,27 @@ jobs: make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile docker build . displayName: Run manylinux1 docker build +- job: Intel_SDE_skx + pool: + vmImage: 'ubuntu-16.04' + steps: + - script: | + # at the time of writing the available Azure Ubuntu vm image + # does not support AVX512VL, so use more recent LTS version + echo "FROM ubuntu:bionic + COPY . /tmp/openblas + RUN apt-get -y update && apt-get -y install \\ + cmake \\ + gfortran \\ + make \\ + wget + RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ + mkdir sde-external-8.35.0-2019-03-11-lin && \\ + wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ + tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 + RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 + CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile + docker build -t intel_sde . 
+ # we need a privileged docker run for sde process attachment + docker run --privileged intel_sde + displayName: 'Run AVX512 SkylakeX docker build / test' diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc306501f..4e647cadc 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,6 +38,7 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c + test_kernel_regress.c ) endif() diff --git a/utest/Makefile b/utest/Makefile index 550a65569..cbe639cdb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -13,6 +13,7 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o +OBJS += test_kernel_regress.o endif #this does not work with OpenMP nor with native Windows or Android threads diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c new file mode 100644 index 000000000..93a30b30c --- /dev/null +++ b/utest/test_kernel_regress.c @@ -0,0 +1,50 @@ +#include "openblas_utest.h" +#include +#include +#include + +#define LAPACK_ROW_MAJOR 101 +blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt, + blasint m, blasint n, double* a, + blasint lda, double* s, double* u, blasint ldu, + double* vt, blasint ldvt, double* superb ); + + +#define DATASIZE 100 + +double s[DATASIZE]; +double u[DATASIZE*DATASIZE]; +double vt[DATASIZE*DATASIZE]; +double X[DATASIZE*DATASIZE]; +double superb[DATASIZE]; +double tmp[DATASIZE*DATASIZE]; +double m[DATASIZE*DATASIZE]; + +CTEST(kernel_regress,skx_avx) +{ + double norm; + int i, j, info; + srand(0); + for (i = 0; i < DATASIZE*DATASIZE; i++) { + m[i] = (rand()+0.0)/RAND_MAX * 10; + tmp[i] = m[i]; + } + + info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE, + s, u, DATASIZE, vt, DATASIZE, superb); + + for (i = 0; i < DATASIZE; i++) { + for (j = 0; j < DATASIZE; j++) { + u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j]; + } + } + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE); + + for (i = 0; i < DATASIZE*DATASIZE; i++) { + X[i] = X[i] - tmp[i]; + } + + norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); + ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); +} From d2cb610272137536416df2e44f1bc8175ddd4eaf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:18:43 +0200 Subject: [PATCH 055/127] Add option USE_LOCKING for single-threaded build with locking support for calling from concurrent threads --- Makefile.rule | 10 ++++++++-- Makefile.system | 12 ++++++++++++ common.h | 4 ++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 17815096e..faf8c8013 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,13 @@ VERSION = 0.3.7.dev # specify it. # For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -# USE_THREAD = 0 +USE_THREAD = 0 + +# If you want to build a single-threaded OpenBLAS, but expect to call this +# from several concurrent threads in some other program, comment this in for +# thread safety. (This is done automatically for USE_THREAD=1 , and should not +# be necessary when USE_OPENMP=1) +# USE_LOCKING = 1 # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. 
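The scenario behind USE_LOCKING is worth illustrating before the remaining
hunks. Below is a minimal sketch — not part of the patch — of an application
that links a USE_THREAD=0 build of the library but issues BLAS calls from two
of its own threads, which is exactly the pattern the new locking is meant to
make safe; the matrix size and iteration count are arbitrary:

    #include <cblas.h>
    #include <pthread.h>
    #include <stdlib.h>

    enum { DIM = 64 };
    static double a[DIM * DIM], b[DIM * DIM];  /* shared, read-only inputs */

    /* Each thread owns its output buffer: the race guarded against lives in
       OpenBLAS itself (the shared buffer pool in memory.c), not in user data. */
    static void *worker(void *arg) {
        double *c = malloc(sizeof(double) * DIM * DIM);
        for (int i = 0; i < 100; i++)
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        DIM, DIM, DIM, 1.0, a, DIM, b, DIM, 0.0, c, DIM);
        free(c);
        return arg;
    }

    int main(void) {
        pthread_t t1, t2;
        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }

With USE_THREAD=0 and no USE_LOCKING, the alloc_lock/init_lock paths guarded
by the preprocessor changes below compile away, so two such workers can race
on the library's buffer bookkeeping.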
@@ -220,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -# DEBUG = 1 +DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV diff --git a/Makefile.system b/Makefile.system index a95d6190f..29aef7e27 100644 --- a/Makefile.system +++ b/Makefile.system @@ -237,6 +237,10 @@ SMP = 1 endif endif +ifeq ($(SMP), 1) +USE_LOCKING = +endif + ifndef NEED_PIC NEED_PIC = 1 endif @@ -388,6 +392,12 @@ ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif +ifdef USE_LOCKING +ifneq ($(USE_LOCKING), 0) +CCOMMON_OPT += -DUSE_LOCKING +endif +endif + # # Architecture dependent settings # @@ -744,6 +754,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI changes in gfortran 9 that break calls from C code +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran diff --git a/common.h b/common.h index 0ac74bb20..a9fe8d911 100644 --- a/common.h +++ b/common.h @@ -131,7 +131,7 @@ extern "C" { #include #include #include -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #include #endif #endif @@ -200,7 +200,7 @@ extern "C" { #error "You can't specify both LOCK operation!" #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif From 1e52572be38541cc11ac39cef6cded8a640bb65b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:19:30 +0200 Subject: [PATCH 056/127] Add option USE_LOCKING for single-threaded build with locking support --- cmake/system.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index d0f560872..adedd32cc 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -136,6 +136,10 @@ endif () if (USE_THREAD) message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") +else() + if (${USE_LOCKING}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") + endif () endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") From 86dda5c2fa9e298deacdd17211e2c4e58f2688ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:21:20 +0200 Subject: [PATCH 057/127] Add option USE_LOCKING for SMP-like locking in USE_THREAD=0 builds --- driver/others/memory.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..adb1ec86c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2062,13 +2062,13 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2214,13 +2214,13 @@ static void *alloc_mmap(void *address){ #endif if (map_address != (void *)-1) { -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos 
++; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif } @@ -2701,7 +2701,7 @@ void *blas_memory_alloc(int procpos){ position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif do { @@ -2718,7 +2718,7 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2730,7 +2730,7 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); @@ -2779,11 +2779,11 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif memory[position].addr = map_address; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2839,7 +2839,7 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) @@ -2855,7 +2855,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2872,7 +2872,7 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif -#if defined(SMP) && !defined(USE_OPENMP) +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif return; @@ -2924,7 +2924,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2949,7 +2949,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) LOCK_COMMAND(&init_lock); #endif @@ -2959,7 +2959,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#ifdef SMP +#if defined(SMP) || defined(USE_LOCKING) UNLOCK_COMMAND(&init_lock); #endif From 5ecffc28f2c32a23222ab633c904c9886923ecf1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:36:17 +0200 Subject: [PATCH 058/127] Add option USE_LOCKING but keep default settings intact --- Makefile.rule | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index faf8c8013..255d1da46 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -56,7 +56,7 @@ VERSION = 0.3.7.dev # specify it. 
# For force setting for single threaded, specify USE_THREAD = 0 # For force setting for multi threaded, specify USE_THREAD = 1 -USE_THREAD = 0 +# USE_THREAD = 0 # If you want to build a single-threaded OpenBLAS, but expect to call this # from several concurrent threads in some other program, comment this in for @@ -226,7 +226,7 @@ NO_AFFINITY = 1 COMMON_PROF = -pg # Build Debug version -DEBUG = 1 +# DEBUG = 1 # Set maximum stack allocation. # The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV From f66c11fc22fa01eb8e120d4274d262b3795e4281 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 May 2019 23:38:12 +0200 Subject: [PATCH 059/127] Remove unrelated change --- Makefile.system | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 29aef7e27..f574edf88 100644 --- a/Makefile.system +++ b/Makefile.system @@ -754,8 +754,6 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive -# work around ABI changes in gfortran 9 that break calls from C code -FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From 1778fd4219688e84463844f3aeaf824ca4043b31 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 22 May 2019 13:48:27 +0200 Subject: [PATCH 060/127] Do not try ancient PGI hacks with recent versions of that compiler should fix #2139 --- driver/others/memory.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 02352b3ae..bf2cfb996 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif @@ -3192,7 +3194,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); - +#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3200,6 +3202,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif +#endif } #endif From 940f38f6dd504c02a554470b53545270e8e5a351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 May 2019 13:02:23 +0200 Subject: [PATCH 061/127] Build and run utests in any case, they do their own checks for fortran availability --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 273fde33e..aed248ef2 100644 --- a/Makefile +++ b/Makefile @@ -123,8 +123,8 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all - $(MAKE) -C utest all endif + $(MAKE) -C utest all ifndef NO_CBLAS $(MAKE) -C ctest all endif From 79366ff7a9548e7eb5d200c7ac444d35b28f2b7a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 May 2019 20:34:22 +0200 Subject: [PATCH 
062/127] Add softfp support in min/max kernels fix for #1912 --- kernel/arm/iamax_vfp.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index fd43b15b1..ae362935e 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,9 +469,11 @@ iamax_kernel_S10: iamax_kernel_L999: - +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#else mov r0, INDEX // set return value - +#endif pop {r4} bx lr From d76b20b4d2617582c8e1ac8a5aeb079e5c9de6f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 14:07:17 +0200 Subject: [PATCH 063/127] Revert "Add softfp support in min/max kernels" --- kernel/arm/iamax_vfp.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index ae362935e..fd43b15b1 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -469,11 +469,9 @@ iamax_kernel_S10: iamax_kernel_L999: -#if !defined(__ARM_PCS_VFP) - vmov r0, s0 -#else + mov r0, INDEX // set return value -#endif + pop {r4} bx lr From c70496b1082983e4d68a2513486a9d2fcbef44e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 29 May 2019 15:02:51 +0200 Subject: [PATCH 064/127] Separate implementations of AMAX and IAMAX on arm As noted in #1912 and comment on #1942, the combined implementation happens to "do the right thing" on hardfp, but cannot return both value and index on softfp where they would have to share the return register --- kernel/arm/KERNEL.ARMV6 | 24 +-- kernel/arm/amax_vfp.S | 441 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 453 insertions(+), 12 deletions(-) create mode 100644 kernel/arm/amax_vfp.S diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index b773a5ba0..1c561deb6 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,20 +1,20 @@ include $(KERNELDIR)/KERNEL.ARMV5 -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +SAMAXKERNEL = amax_vfp.S +DAMAXKERNEL = amax_vfp.S +CAMAXKERNEL = amax_vfp.S +ZAMAXKERNEL = amax_vfp.S -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S +SAMINKERNEL = amax_vfp.S +DAMINKERNEL = amax_vfp.S +CAMINKERNEL = amax_vfp.S +ZAMINKERNEL = amax_vfp.S -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S +SMAXKERNEL = amax_vfp.S +DMAXKERNEL = amax_vfp.S -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S +SMINKERNEL = amax_vfp.S +DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S new file mode 100644 index 000000000..c780ce5bd --- /dev/null +++ b/kernel/arm/amax_vfp.S @@ -0,0 +1,441 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 } + VABS( d0, d0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 } + VABS( d0, d0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 } + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 } + VABS( s0, s0 ) + +.endm + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 } + VABS( s0, s0 ) + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 } + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + vldmia.f64 X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 +.endm + + +.macro KERNEL_F1 + + vldmia.f64 X!, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 
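+	// vcmpe updates only the FPSCR flags; vmrs below copies them to APSR so the conditional VMOVCOND can test them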
+ vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + +.endm + +.macro INIT_S + + vldmia.f64 X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f64 X, { d4 - d5 } + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + vldmia.f32 X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + +.endm + + +.macro KERNEL_F1 + + vldmia.f32 X!, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + +.endm + +.macro INIT_S + + vldmia.f32 X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + vldmia.f32 X, { s4 - s5 } + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + movs r12, #0 // clear floating point register + vmov s0, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 +#endif + + + cmp N, #0 + ble amax_kernel_L999 + + cmp INC_X, #0 + beq amax_kernel_L999 + + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + + +amax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_F1 + + .align 5 + +amax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble amax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + b amax_kernel_L999 + +amax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble amax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble amax_kernel_S1 + + .align 5 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + + +amax_kernel_L999: +#if !defined(__ARM_PCS_VFP) + vmov r0, s0 +#endif + bx lr + + EPILOGUE + From c5495d20563d9a7a142c6726d24c0fd485fcedf6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:25:43 +0200 Subject: [PATCH 065/127] Ensure correct output for DAMAX with softfp --- kernel/arm/amax_vfp.S | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S index c780ce5bd..d3770ea1e 100644 --- a/kernel/arm/amax_vfp.S +++ b/kernel/arm/amax_vfp.S @@ -432,8 +432,12 @@ amax_kernel_S10: amax_kernel_L999: -#if !defined(__ARM_PCS_VFP) +#if 
!defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else vmov r0, s0 +#endif #endif bx lr From 74c10b57c6ea9d80f77c469b50f90989843b0bb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 May 2019 11:38:11 +0200 Subject: [PATCH 066/127] Use generic kernels for complex (I)AMAX to support softfp --- kernel/arm/KERNEL.ARMV6 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1c561deb6..344a71885 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -2,13 +2,13 @@ include $(KERNELDIR)/KERNEL.ARMV5 SAMAXKERNEL = amax_vfp.S DAMAXKERNEL = amax_vfp.S -CAMAXKERNEL = amax_vfp.S -ZAMAXKERNEL = amax_vfp.S +#CAMAXKERNEL = amax_vfp.S +#ZAMAXKERNEL = amax_vfp.S SAMINKERNEL = amax_vfp.S DAMINKERNEL = amax_vfp.S -CAMINKERNEL = amax_vfp.S -ZAMINKERNEL = amax_vfp.S +#CAMINKERNEL = amax_vfp.S +#ZAMINKERNEL = amax_vfp.S SMAXKERNEL = amax_vfp.S DMAXKERNEL = amax_vfp.S @@ -18,13 +18,13 @@ DMINKERNEL = amax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S +#ICAMAXKERNEL = iamax_vfp.S +#IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S +#ICAMINKERNEL = iamax_vfp.S +#IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S From 8fe794f059a29922f1a4de7ecd143f35c79eb7e9 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Thu, 23 May 2019 04:23:43 +0000 Subject: [PATCH 067/127] improved zgemm power9 based on power8 --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/sgemm_kernel_power9.S | 2 +- kernel/power/sgemm_logic_power9.S | 40 +- kernel/power/zgemm_kernel_power9.S | 257 +++++ kernel/power/zgemm_logic_power9.S | 857 ++++++++++++++ kernel/power/zgemm_macros_power9.S | 1664 ++++++++++++++++++++++++++++ param.h | 4 +- 7 files changed, 2802 insertions(+), 24 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power9.S create mode 100644 kernel/power/zgemm_logic_power9.S create mode 100644 kernel/power/zgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0e0d62393..5c10ad64a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy.o CGEMMINCOPYOBJ = cgemm_incopy.o CGEMMITCOPYOBJ = cgemm_itcopy.o -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index a44659468..f408cdc17 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -168,7 +168,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*alpha is stored in f1. 
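   Since f1 overlays vs1 in the unified FP/VSX register file, the scalar can be
   read directly as a vector-scalar operand here;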
convert to single and splat*/ - xscvdpspn alpha_r,vs1 + xscvdpspn alpha_r,vs1 xxspltw alpha_r,alpha_r,0 diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 300e30470..c149cb903 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -53,9 +53,9 @@ LSGEMM_L8x16_BEGIN: LSGEMM_L8x16_LOOP_START: LOAD8x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=32 - addi AO,AO,2112 - addi BO,BO,32 + /*##OffsetA=64 OffsetB=32 + #addi AO,AO,2112 + #addi BO,BO,32 */ mtctr L @@ -63,29 +63,29 @@ LSGEMM_L8x16_LOOP_START: LSGEMM_L8x16_LOOP: - KERNEL8x16_I1_L4_2 -2048,0, 0,0 - KERNEL8x16_I1_L4_2 -2048,0, 1,0 - KERNEL8x16_I1_L4_2 -2048,0, 2,0 - KERNEL8x16_I1_L4_2 -2048,0, 3,0 - KERNEL8x16_I1_L4_2 -2048,0, 4,0 - KERNEL8x16_I1_L4_2 -2048,0, 5,0 - KERNEL8x16_I1_L4_2 -2048,0, 6,0 - KERNEL8x16_I1_L4_2 -2048,0, 7,0 - KERNEL8x16_I1_L4_2 -2048,0, 8,0 - KERNEL8x16_I1_L4_2 -2048,0, 9,0 - KERNEL8x16_I1_L4_2 -2048,0, 10,0 - KERNEL8x16_I1_L4_2 -2048,0, 11,0 - KERNEL8x16_I1_L4_2 -2048,0, 12,0 - KERNEL8x16_I1_L4_2 -2048,0, 13,0 - KERNEL8x16_I1_L4_2 -2048,0, 14,0 - KERNEL8x16_I1_L4_2 -2048,0, 15,1 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, -2048, 0 + END8x16 0, AO, BO, 64, 32 b LSGEMM_L8x16_SUB1 MY_ALIGN diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S new file mode 100644 index 000000000..e655f0bfe --- /dev/null +++ b/kernel/power/zgemm_kernel_power9.S @@ -0,0 +1,257 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 32192 + +#define FZERO 312+192(SP) + + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + +#define L r15 +#define ALPHA r16 +#define T5 r17 +#define T2 r19 +#define BBO r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv v20, 288(SP) + stxv v21, 304(SP) + stxv v22, 320(SP) + stxv v23, 336(SP) + stxv v24, 352(SP) + stxv v25, 368(SP) + stxv v26, 384(SP) + stxv v27, 400(SP) + stxv v28, 416(SP) + stxv v29, 432(SP) + stxv v30, 448(SP) + stxv v31, 464(SP) + + + stw r0, FZERO + +#ifdef linux + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + cmpwi cr0, M, 0 + ble L999 + cmpwi cr0, N, 0 + ble L999 + cmpwi cr0, K, 0 + ble L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li o8 , 8 + li o16 , 16 + + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + + + addi ALPHA, SP, 296+192 + + xxlor alpha_r,vs1,vs1 /*copy from register f1 */ + xxlor alpha_i,vs2,vs2 /*copy from register f2 */ + + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + 
ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv v20, 288(SP) + lxv v21, 304(SP) + lxv v22, 320(SP) + lxv v23, 336(SP) + lxv v24, 352(SP) + lxv v25, 368(SP) + lxv v26, 384(SP) + lxv v27, 400(SP) + lxv v28, 416(SP) + lxv v29, 432(SP) + lxv v30, 448(SP) + lxv v31, 464(SP) + + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S new file mode 100644 index 000000000..77ce36294 --- /dev/null +++ b/kernel/power/zgemm_logic_power9.S @@ -0,0 +1,857 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define MY_ALIGN .align 3 + + srawi. J, N, 1 + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 2 + ble ZGEMM_L2_COPYB1 + +ZGEMM_L2_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB8 + +ZGEMM_L2_COPYB1: + + andi. T1, K, 3 + ble ZGEMM_L2_COPYB_END + +ZGEMM_L2_COPYB_LOOP: + + ZCOPYB_2 + addic. T1, T1, -1 + + bgt ZGEMM_L2_COPYB_LOOP + +ZGEMM_L2_COPYB_END: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble ZGEMM_L2x8_END + +ZGEMM_L2x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + + +ZGEMM_L2x8_LOOP_START: + + LOAD2x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,64,0,0 + KERNEL2x8_L 128,64,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,64,2,0 + KERNEL2x8_L 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,64,4,0 + KERNEL2x8_L 128,64,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,64,6,0 + KERNEL2x8_L 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,64,8,0 + KERNEL2x8_L 128,64,9,0 + KERNEL2x8_L 128,64,10,0 + KERNEL2x8_L 128,64,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,64,12,0 + KERNEL2x8_L 128,64,13,0 + KERNEL2x8_L 128,64,14,0 + KERNEL2x8_L 128,64,15,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128, 64 + + b ZGEMM_L2x8_SUB1 + +ZGEMM_L2x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x8_SUB2_LOOP: + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_L 128,64, 1,0 + KERNEL2x8_L 128,64, 2,0 + KERNEL2x8_E 128,64, 3,1 + bdnz ZGEMM_L2x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8 0 + KERNEL2x8_L 128,64, 0,0 + KERNEL2x8_E 128,64, 1,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8 0 + KERNEL2x8_E 128,64, 0,1 + MY_ALIGN +ZGEMM_L2x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + +/* addic. L, L, -1 + bgt ZGEMM_L2x8_SUB2_1*/ + +ZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_END: + +ZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + +ZGEMM_L2x4_LOOP_START: + LOAD2x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,64,0,0 + KERNEL2x4_L 64,64,1,0 + KERNEL2x4_L 64,64,2,0 + KERNEL2x4_L 64,64,3,0 + KERNEL2x4_L 64,64,4,0 + KERNEL2x4_L 64,64,5,0 + KERNEL2x4_L 64,64,6,0 + KERNEL2x4_L 64,64,7,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64, 64 + + b ZGEMM_L2x4_SUB1 + +ZGEMM_L2x4_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x4_SUB2 + +ZGEMM_L2x4_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x4_SAVE + +ZGEMM_L2x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x4_SUB2_LOOP: + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_L 64,64, 1,0 + KERNEL2x4_L 64,64, 2,0 + KERNEL2x4_E 64,64, 3,1 + bdnz ZGEMM_L2x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4 0 + KERNEL2x4_L 64,64, 0,0 + KERNEL2x4_E 64,64, 1,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4 0 + KERNEL2x4_E 64,64, 0,1 + MY_ALIGN +ZGEMM_L2x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + +ZGEMM_L2x4_SAVE: + + SAVE2x4 + +ZGEMM_L2x4_END: + +ZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L2x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 4 /**(K-1) % 16x */ + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + +ZGEMM_L2x2_LOOP_START: + LOAD2x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x2_LOOP: + KERNEL2x2_L 32,64,0,0 + KERNEL2x2_L 32,64,1,0 + KERNEL2x2_L 32,64,2,0 + KERNEL2x2_L 32,64,3,0 + KERNEL2x2_L 32,64,4,0 + KERNEL2x2_L 32,64,5,0 + KERNEL2x2_L 32,64,6,0 + KERNEL2x2_L 32,64,7,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN +ZGEMM_L2x2_LOOP_END: + END2x2 AO, BO, 32, 64 + + b ZGEMM_L2x2_SUB1 + +ZGEMM_L2x2_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x2_SUB2 + +ZGEMM_L2x2_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x2_SAVE + +ZGEMM_L2x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x2_SUB2_LOOP: + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_L 32,64, 1,0 + KERNEL2x2_L 32,64, 2,0 + KERNEL2x2_E 32,64, 3,1 + bdnz ZGEMM_L2x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2 0 + KERNEL2x2_L 32,64, 0,0 + KERNEL2x2_E 32,64, 1,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2 0 + KERNEL2x2_E 32,64, 0,1 + MY_ALIGN +ZGEMM_L2x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 +ZGEMM_L2x2_SAVE: + + SAVE2x2 + +ZGEMM_L2x2_END: + +ZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L2x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 4 /**(K-1) % 16x */ + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + +ZGEMM_L2x1_LOOP_START: + + LOAD2x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L2x1_LOOP: + KERNEL2x1_L 16,64,0,0 + KERNEL2x1_L 16,64,1,0 + KERNEL2x1_L 16,64,2,0 + KERNEL2x1_L 16,64,3,0 + KERNEL2x1_L 16,64,4,0 + KERNEL2x1_L 16,64,5,0 + KERNEL2x1_L 16,64,6,0 + KERNEL2x1_L 16,64,7,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: + END2x1 AO, BO, 16, 64 + + b ZGEMM_L2x1_SUB1 + +ZGEMM_L2x1_SUB0: + + andi. L, K, 31 + + b ZGEMM_L2x1_SUB2 + +ZGEMM_L2x1_SUB1: + + andi. L, T1, 15 + ble ZGEMM_L2x1_SAVE + +ZGEMM_L2x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L2x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L2x1_SUB2_LOOP: + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_L 16,64, 1,0 + KERNEL2x1_L 16,64, 2,0 + KERNEL2x1_E 16,64, 3,1 + bdnz ZGEMM_L2x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L2x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1 0 + KERNEL2x1_L 16,64, 0,0 + KERNEL2x1_E 16,64, 1,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1 0 + KERNEL2x1_E 16,64, 0,1 + MY_ALIGN +ZGEMM_L2x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + +ZGEMM_L2x1_SAVE: + + SAVE2x1 + +ZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt ZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble L999 + +ZGEMM_L2_END: + + b ZGEMM_L1_BEGIN + +L999_H1: + + b L999 + +ZGEMM_L1_BEGIN: + andi. T1, N, 1 + ble ZGEMM_L1_END + + mr BO, B + mr BBO, BBUFFER + srawi. T1, K, 3 /*this time K/8 */ + ble ZGEMM_L1_COPYB1 + +ZGEMM_L1_COPYB8: + + addi T2, PRE, 128 + dcbt BO, PRE + dcbtst BBO, PRE + dcbtst BBO, T2 + ZCOPYB_8 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB8 + +ZGEMM_L1_COPYB1: + + andi. T1, K, 7 + ble ZGEMM_L1_COPYB_END + +ZGEMM_L1_COPYB_LOOP: + + ZCOPYB_1 + addic. T1, T1, -1 + + bgt ZGEMM_L1_COPYB_LOOP + +ZGEMM_L1_COPYB_END: + + mr CO, C + mr AO, A + srawi. I, M, 3 + ble ZGEMM_L1x8_END + +ZGEMM_L1x8_BEGIN: + + + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 32x */ + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + + +ZGEMM_L1x8_LOOP_START: + + LOAD1x8 0 + li T2, 1024 + li T3, 1024+512 + li T4, 2048 + li T5, 2048+512 + mtctr L + + MY_ALIGN +ZGEMM_L1x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L 128,32,0,0 + KERNEL1x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL1x8_L 128,32,2,0 + KERNEL1x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L 128,32,4,0 + KERNEL1x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL1x8_L 128,32,6,0 + KERNEL1x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L 128,32,8,0 + KERNEL1x8_L 128,32,9,0 + KERNEL1x8_L 128,32,10,0 + KERNEL1x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL1x8_L 128,32,12,0 + KERNEL1x8_L 128,32,13,0 + KERNEL1x8_L 128,32,14,0 + KERNEL1x8_L 128,32,15,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: + END1x8 AO, BO, 128, 32 + + b ZGEMM_L1x8_SUB1 + +ZGEMM_L1x8_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x8_SUB2 + +ZGEMM_L1x8_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x8_SAVE + +ZGEMM_L1x8_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x8_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x8_SUB2_LOOP: + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_L 128,32, 1,0 + KERNEL1x8_L 128,32, 2,0 + KERNEL1x8_E 128,32, 3,1 + bdnz ZGEMM_L1x8_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x8_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8 0 + KERNEL1x8_L 128,32, 0,0 + KERNEL1x8_E 128,32, 1,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8 0 + KERNEL1x8_E 128,32, 0,1 + MY_ALIGN +ZGEMM_L1x8_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + +/* addic. L, L, -1 + bgt ZGEMM_L1x8_SUB2_1*/ + +ZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt ZGEMM_L1x8_BEGIN + +ZGEMM_L1x8_END: + +ZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble ZGEMM_L1x1_END + + andi. T1, M, 4 + ble ZGEMM_L1x4_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + +ZGEMM_L1x4_LOOP_START: + LOAD1x4 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x4_LOOP: + KERNEL1x4_L 64,32,0,0 + KERNEL1x4_L 64,32,1,0 + KERNEL1x4_L 64,32,2,0 + KERNEL1x4_L 64,32,3,0 + KERNEL1x4_L 64,32,4,0 + KERNEL1x4_L 64,32,5,0 + KERNEL1x4_L 64,32,6,0 + KERNEL1x4_L 64,32,7,0 + KERNEL1x4_L 64,32,8,0 + KERNEL1x4_L 64,32,9,0 + KERNEL1x4_L 64,32,10,0 + KERNEL1x4_L 64,32,11,0 + KERNEL1x4_L 64,32,12,0 + KERNEL1x4_L 64,32,13,0 + KERNEL1x4_L 64,32,14,0 + KERNEL1x4_L 64,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN +ZGEMM_L1x4_LOOP_END: + END1x4 AO, BO, 64, 32 + + b ZGEMM_L1x4_SUB1 + +ZGEMM_L1x4_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x4_SUB2 + +ZGEMM_L1x4_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + +ZGEMM_L1x4_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x4_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x4_SUB2_LOOP: + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_L 64,32, 1,0 + KERNEL1x4_L 64,32, 2,0 + KERNEL1x4_E 64,32, 3,1 + bdnz ZGEMM_L1x4_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x4_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4 0 + KERNEL1x4_L 64,32, 0,0 + KERNEL1x4_E 64,32, 1,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4 0 + KERNEL1x4_E 64,32, 0,1 + MY_ALIGN +ZGEMM_L1x4_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + +ZGEMM_L1x4_SAVE: + + SAVE1x4 + +ZGEMM_L1x4_END: + +ZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble ZGEMM_L1x2_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. 
L, T1, 5 /**(K-1) % 16x */ + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + +ZGEMM_L1x2_LOOP_START: + LOAD1x2 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x2_LOOP: + KERNEL1x2_L 32,32,0,0 + KERNEL1x2_L 32,32,1,0 + KERNEL1x2_L 32,32,2,0 + KERNEL1x2_L 32,32,3,0 + KERNEL1x2_L 32,32,4,0 + KERNEL1x2_L 32,32,5,0 + KERNEL1x2_L 32,32,6,0 + KERNEL1x2_L 32,32,7,0 + KERNEL1x2_L 32,32,8,0 + KERNEL1x2_L 32,32,9,0 + KERNEL1x2_L 32,32,10,0 + KERNEL1x2_L 32,32,11,0 + KERNEL1x2_L 32,32,12,0 + KERNEL1x2_L 32,32,13,0 + KERNEL1x2_L 32,32,14,0 + KERNEL1x2_L 32,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN +ZGEMM_L1x2_LOOP_END: + END1x2 AO, BO, 32, 32 + + b ZGEMM_L1x2_SUB1 + +ZGEMM_L1x2_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x2_SUB2 + +ZGEMM_L1x2_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + +ZGEMM_L1x2_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x2_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x2_SUB2_LOOP: + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_L 32,32, 1,0 + KERNEL1x2_L 32,32, 2,0 + KERNEL1x2_E 32,32, 3,1 + bdnz ZGEMM_L1x2_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x2_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2 0 + KERNEL1x2_L 32,32, 0,0 + KERNEL1x2_E 32,32, 1,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2 0 + KERNEL1x2_E 32,32, 0,1 + MY_ALIGN +ZGEMM_L1x2_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 +ZGEMM_L1x2_SAVE: + + SAVE1x2 + +ZGEMM_L1x2_END: + +ZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble ZGEMM_L1x1_END + mr BO, BBUFFER + mr T1, K + addi T1,T1, -1 + srawi. L, T1, 5 /**(K-1) % 16x */ + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + +ZGEMM_L1x1_LOOP_START: + + LOAD1x1 0 + mtctr L + + MY_ALIGN +ZGEMM_L1x1_LOOP: + KERNEL1x1_L 16,32,0,0 + KERNEL1x1_L 16,32,1,0 + KERNEL1x1_L 16,32,2,0 + KERNEL1x1_L 16,32,3,0 + KERNEL1x1_L 16,32,4,0 + KERNEL1x1_L 16,32,5,0 + KERNEL1x1_L 16,32,6,0 + KERNEL1x1_L 16,32,7,0 + KERNEL1x1_L 16,32,8,0 + KERNEL1x1_L 16,32,9,0 + KERNEL1x1_L 16,32,10,0 + KERNEL1x1_L 16,32,11,0 + KERNEL1x1_L 16,32,12,0 + KERNEL1x1_L 16,32,13,0 + KERNEL1x1_L 16,32,14,0 + KERNEL1x1_L 16,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN +ZGEMM_L1x1_LOOP_END: + END1x1 AO, BO, 16, 32 + + b ZGEMM_L1x1_SUB1 + +ZGEMM_L1x1_SUB0: + + andi. L, K, 63 + + b ZGEMM_L1x1_SUB2 + +ZGEMM_L1x1_SUB1: + + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + +ZGEMM_L1x1_SUB2: + srawi. T1,L, 3 + ble ZGEMM_L1x1_SUB2_4 + mtctr T1 + MY_ALIGN +ZGEMM_L1x1_SUB2_LOOP: + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_L 16,32, 1,0 + KERNEL1x1_L 16,32, 2,0 + KERNEL1x1_E 16,32, 3,1 + bdnz ZGEMM_L1x1_SUB2_LOOP + MY_ALIGN +ZGEMM_L1x1_SUB2_4: + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1 0 + KERNEL1x1_L 16,32, 0,0 + KERNEL1x1_E 16,32, 1,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_2: + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1 0 + KERNEL1x1_E 16,32, 0,1 + MY_ALIGN +ZGEMM_L1x1_SUB2_1: + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + +ZGEMM_L1x1_SAVE: + + SAVE1x1 + +ZGEMM_L1x1_END: + +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S new file mode 100644 index 000000000..93a309ad1 --- /dev/null +++ b/kernel/power/zgemm_macros_power9.S @@ -0,0 +1,1664 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V + AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 +.endm + +.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 + xxlxor \TEMP1, \TEMP1, \TEMP1 + xxlxor \TEMP2, \TEMP2, \TEMP2 + + xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB + XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB + + xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB + xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB + + XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB + XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB + + xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i + xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r + xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r + xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i + + xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i + xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r + xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) 
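+/* Note: each DISPn(ind,disp) helper expands to ind*unit_size*n + disp, i.e. the
+   byte offset of unrolled iteration `ind` over n complex doubles (16 bytes
+   each) plus a fixed displacement; e.g. DISP16(1,32) = 1*16*16 + 32 = 288. */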
+#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD2x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero2x8 +.endif + +.endm + +.macro END2x8_NORMAL + END2x8 AO,BO,128,64 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, 
\OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B 
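+       /* B is read from BBUFFER, which the ZCOPYB_* macros filled with the
+          real and imaginary part of each B element splatted into separate
+          quads, so one k step of the two columns consumes 64 bytes of B. */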
+ lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x8 + LOAD2x8 0 + END2x8 AO, BO, 128,64 +.endm + +.macro SAVE2x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 + AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 + AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 + AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 + AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 + 
AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 + AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 + AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD2x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero2x4 +.endif + +.endm + +.macro END2x4_NORMAL + END2x4 AO,BO,64,64 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag 
part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x4 + LOAD2x4 0 + END2x4 AO, BO, 64,64 +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + 
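+       /* T1 was advanced by LDC above, so these stores write the second row
+          of C with the alpha-scaled vs40..vs47 results */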
stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD2x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero2x2 +.endif + +.endm + +.macro END2x2_NORMAL + END2x2 AO,BO,32,64 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, 
vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x2 + LOAD2x2 0 + END2x2 AO, BO, 32,64 +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + + addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD2x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + lxv vs18, 32(BO) // load real part from B + lxv vs19, 48(BO) // load imag part from B + +.if \Zero==1 + Zero2x1 +.endif + +.endm + +.macro END2x1_NORMAL + END2x1 AO,BO,16,64 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B + lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B + lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B + lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B + lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP8(\Index,128) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, 
imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL2x1 + LOAD2x1 0 + END2x1 AO, BO, 16,64 +.endm + +.macro SAVE2x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + + AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + + addi CO, CO, 16 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD1x8 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A + +.if \Zero==1 + Zero1x8 +.endif + +.endm + +.macro END1x8_NORMAL + END1x8 AO,BO,128,32 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, 
DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x8 + LOAD1x8 0 + END1x8 AO, BO, 128,32 +.endm + +.macro SAVE1x8 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + lxv vs20, 0(T2) + lxv vs21, 16(T2) + lxv vs22, 32(T2) + lxv vs23, 48(T2) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + AGGREGATE_INTO_COMPLEX 
vs40,vs41,vs12 + AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 + AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 + AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + stxv vs12, 0(T2) + stxv vs13, 16(T2) + stxv vs14, 32(T2) + stxv vs15, 48(T2) + + addi CO, CO, 128 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + +.macro LOAD1x4 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A + +.if \Zero==1 + Zero1x4 +.endif + +.endm + +.macro END1x4_NORMAL + END1x4 AO,BO,64,32 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, 
imag*imag + +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + +.endm + +.macro KERNEL1x4 + LOAD1x4 0 + END1x4 AO, BO, 64,32 +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + lxv vs18, 32(T1) + lxv vs19, 48(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 + AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + stxv vs10, 32(T1) + stxv vs11, 48(T1) + + addi CO, CO, 64 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 +.endm + +.macro LOAD1x2 Zero + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + +.if \Zero==1 + Zero1x2 +.endif + +.endm + +.macro END1x2_NORMAL + END1x2 AO,BO,32,32 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // 
load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + +lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +.if \Complete==0 + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + +.endm + +.macro KERNEL1x2 + LOAD1x2 0 + END1x2 AO, BO, 32,32 +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxv vs16, 0(T1) + lxv vs17, 16(T1) + +#endif + + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxv vs8, 0(T1) + stxv vs9, 16(T1) + +addi CO, CO, 32 + +.endm + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + +.macro LOAD1x1 Zero + lxv vs0, 0(AO) // load real,imag from A + + lxv vs16, 0(BO) // load real part from B + lxv vs17, 16(BO) // load imag part from B + +.if \Zero==1 + Zero1x1 +.endif + +.endm + +.macro END1x1_NORMAL + END1x1 AO,BO,16,32 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B + lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + +.if \Complete==0 + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B + lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif + + xvmaddadp vs32, 
vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1 + LOAD1x1 0 + END1x1 AO, BO, 16,32 + +.endm + +.macro SAVE1x1 + + mr T1, CO +#ifndef TRMMKERNEL + lxv vs16, 0(T1) +#endif + AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 + +#ifndef TRMMKERNEL + xvadddp vs8, vs8, vs16 +#endif + + stxv vs8, 0(T1) + +addi CO, CO, 16 + +.endm + + +.macro ZCOPYB_2 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + addi BO, BO, 32 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + addi BBO, BBO, 64 + +.endm + +.macro ZCOPYB_1 + + lxv vs32, 0(BO) + addi BO, BO, 16 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + + addi BBO, BBO, 32 + +.endm + +.macro ZCOPYB_8 + + lxv vs32, 0(BO) + lxv vs33, 16(BO) + lxv vs34, 32(BO) + lxv vs35, 48(BO) + + lxv vs36, 64+0(BO) + lxv vs37, 64+16(BO) + lxv vs38, 64+32(BO) + lxv vs39, 64+48(BO) + addi BO, BO, 128 + xxspltd vs40, vs32, 1 + xxspltd vs41, vs32, 0 + xxspltd vs42, vs33, 1 + xxspltd vs43, vs33, 0 + xxspltd vs44, vs34, 1 + xxspltd vs45, vs34, 0 + xxspltd vs46, vs35, 1 + xxspltd vs47, vs35, 0 + + xxspltd vs48, vs36, 1 + xxspltd vs49, vs36, 0 + xxspltd vs50, vs37, 1 + xxspltd vs51, vs37, 0 + xxspltd vs52, vs38, 1 + xxspltd vs53, vs38, 0 + xxspltd vs54, vs39, 1 + xxspltd vs55, vs39, 0 + + stxv vs40, 0(BBO) + stxv vs41, 16(BBO) + stxv vs42, 32(BBO) + stxv vs43, 48(BBO) + + stxv vs44, 64+0(BBO) + stxv vs45, 64+16(BBO) + stxv vs46, 64+32(BBO) + stxv vs47, 64+48(BBO) + + stxv vs48, 128+ 0(BBO) + stxv vs49, 128+ 16(BBO) + stxv vs50, 128+ 32(BBO) + stxv vs51, 128+ 48(BBO) + + stxv vs52, 192 + 0(BBO) + stxv vs53, 192 + 16(BBO) + stxv vs54, 192+ 32(BBO) + stxv vs55, 192 + 48(BBO) + addi BBO, BBO, 256 + +.endm + diff --git a/param.h b/param.h index 4dcd96a75..d0b8518c9 100644 --- a/param.h +++ b/param.h @@ -2251,12 +2251,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 128
 #define CGEMM_DEFAULT_P 640
-#define ZGEMM_DEFAULT_P 320
+#define ZGEMM_DEFAULT_P 512

 #define SGEMM_DEFAULT_Q 1408
 #define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 640
-#define ZGEMM_DEFAULT_Q 640
+#define ZGEMM_DEFAULT_Q 1152

 #define SYMV_P 8

From c00289ba543121a78c5ab07a8e45385cc12fb9a8 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:30:06 +0200
Subject: [PATCH 068/127] upload thread safety test folder

---
 cpp_thread_test/Makefile                   |  14 +++
 cpp_thread_test/cpp_thread_safety_common.h |  55 +++++++++++
 cpp_thread_test/dgemm_thread_safety.cpp    |  92 +++++++++++++++++++
 cpp_thread_test/dgemv_thread_safety.cpp    | 101 +++++++++++++++++++++
 4 files changed, 262 insertions(+)
 create mode 100644 cpp_thread_test/Makefile
 create mode 100644 cpp_thread_test/cpp_thread_safety_common.h
 create mode 100644 cpp_thread_test/dgemm_thread_safety.cpp
 create mode 100644 cpp_thread_test/dgemv_thread_safety.cpp

diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile
new file mode 100644
index 000000000..81e3470ef
--- /dev/null
+++ b/cpp_thread_test/Makefile
@@ -0,0 +1,14 @@
+include ../Makefile.rule
+
+all :: dgemv_tester dgemm_tester
+
+dgemv_tester :
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	./dgemv_tester
+
+dgemm_tester : dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	./dgemm_tester
+
+clean ::
+	rm -f dgemv_tester dgemm_tester

diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h
new file mode 100644
index 000000000..60ab5bb2f
--- /dev/null
+++ b/cpp_thread_test/cpp_thread_safety_common.h
@@ -0,0 +1,55 @@
+inline void pauser(){
+    /// a portable way to pause a program
+    std::string dummy;
+    std::cout << "Press enter to continue...";
+    std::getline(std::cin, dummy);
+}
+
+void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for(uint32_t i=0; i<numMat; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+			matBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
+		for(uint32_t j=0; j<numMat; j++){
+			matBlock[i+j] = matBlock[j]; //copy the first numMat matrices to all other threads, so every thread starts from identical inputs
+		}
+	}
+}
+
+void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
+	for(uint32_t i=0; i<numVec; i++){
+		for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			vecBlock[i][j] = rngdist(PRNG);
+		}
+	}
+	for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
+		for(uint32_t j=0; j<numVec; j++){
+			vecBlock[i+j] = vecBlock[j]; //copy the first numVec vectors to all other threads
+		}
+	}
+}
+
+std::mt19937_64 InitPRNG(){
+	std::random_device rd;
+	std::mt19937_64 PRNG(rd());
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
+	//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
+	for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
+	return PRNG;
+}
+
+void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
+	for (uint32_t i=0;i<(numConcurrentThreads*numMat);i++){
+		std::cout<<i<<std::endl;
+		for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+			for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
+				std::cout<<matBlock[i][j*randomMatSize + k]<<"  ";
+			}
+			std::cout<<std::endl;
+		}
+		std::cout<<std::endl;
+	}
+}

diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemm_thread_safety.cpp
@@ -0,0 +1,92 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
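+// Worker used by the test rounds below: each concurrent thread performs one
+// plain column-major DGEMM, C = 1.0*A*B + 0.1*C, on its own copy of the same
+// inputs, so every thread has to reproduce the same C up to rounding error.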
+void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
+	cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMM thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::cout<<"Initializing random number generator..."<<std::flush;
+	std::mt19937_64 PRNG = InitPRNG();
+	std::cout<<"done\n";
+
+	std::cout<<"Allocating matrices..."<<std::flush;
+	for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
+		matBlock.at(i).resize(randomMatSize*randomMatSize);
+	}
+	std::cout<<"done\n";
+
+	std::cout<<"Filling matrices with random numbers..."<<std::flush;
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
+	std::cout<<"done\n"<<std::endl;
+
+	std::cout<<"Testing CBLAS DGEMM thread safety"<<std::endl;
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		std::cout<<"DGEMM round #"<<R<<std::endl;
+		std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
+		}
+		std::cout<<"done\n";
+		std::cout<<"Waiting for threads to finish..."<<std::flush;
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		std::cout<<"done\n";
+		std::cout<<"Comparing results from different threads..."<<std::flush;
+		for (uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
+			for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
+				if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
+					std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
+					return -1;
+				}
+			}
+		}
+		std::cout<<"OK!\n"<<std::endl;
+	}
+	std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
+	return 0;
+}

diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp
new file mode 100644
--- /dev/null
+++ b/cpp_thread_test/dgemv_thread_safety.cpp
@@ -0,0 +1,101 @@
+#include <iostream>
+#include <vector>
+#include <random>
+#include <future>
+#include <omp.h>
+#include "../cblas.h"
+#include "cpp_thread_safety_common.h"
+
+void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
+	const blasint inc = 1;
+	cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
+}
+
+int main(int argc, char* argv[]){
+	blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
+	uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
+	uint32_t numTestRounds = 16; //number of testing rounds before success exit
+
+	if (argc > 4){
+		std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
+		abort();
+	}
+	if (argc == 4){
+		std::vector<std::string> cliArgs;
+		for (int i = 1; i < argc; i++){
+			cliArgs.push_back(argv[i]);
+			std::cout<<cliArgs.at(i-1)<<std::endl;
+		}
+		randomMatSize = std::stoul(cliArgs.at(0));
+		numConcurrentThreads = std::stoul(cliArgs.at(1));
+		numTestRounds = std::stoul(cliArgs.at(2));
+	}
+
+	std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
+	std::vector<std::vector<double>> matBlock(numConcurrentThreads);
+	std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
+	std::vector<std::future<void>> futureBlock(numConcurrentThreads);
+
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"| DGEMV thread safety tester |\n";
+	std::cout<<"*----------------------------*\n";
+	std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<std::endl;
+	std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<std::endl;
+	std::cout<<"Number of testing rounds : "<<numTestRounds<<std::endl;
+	std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
+
+	std::cout<<"Initializing random number generator..."<<std::flush;
+	std::mt19937_64 PRNG = InitPRNG();
+	std::cout<<"done\n";
+
+	std::cout<<"Allocating matrices and vectors..."<<std::flush;
+	for(uint32_t i=0; i<numConcurrentThreads; i++){
+		matBlock.at(i).resize(randomMatSize*randomMatSize);
+	}
+	for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
+		vecBlock.at(i).resize(randomMatSize);
+	}
+	std::cout<<"done\n";
+
+	std::cout<<"Filling matrices and vectors with random numbers..."<<std::flush;
+	FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
+	FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
+	std::cout<<"done\n"<<std::endl;
+
+	std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
+	omp_set_num_threads(numConcurrentThreads);
+	for(uint32_t R=0; R<numTestRounds; R++){
+		std::cout<<"DGEMV round #"<<R<<std::endl;
+		std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
+		#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
+		}
+		std::cout<<"done\n";
+		std::cout<<"Waiting for threads to finish..."<<std::flush;
+		for (uint32_t i=0; i<numConcurrentThreads; i++){
+			futureBlock[i].get();
+		}
+		std::cout<<"done\n";
+		std::cout<<"Comparing results from different threads..."<<std::flush;
+		for (uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
+			for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
+				if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
+					std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
+					std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
+					return -1;
+				}
+			}
+		}
+		std::cout<<"OK!\n"<<std::endl;
+	}
+	std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
+	return 0;
+}

From: TiborGY
Date: Sat, 1 Jun 2019 21:32:52 +0200
Subject: [PATCH 069/127] hook up c++ thread safety test (main Makefile)

---
 Makefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 21096f893..20ef1e868 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ endif

 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

-SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
+SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
 .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@@ -127,6 +127,9 @@ ifndef NO_FBLAS
 endif
 ifndef NO_CBLAS
 	$(MAKE) -C ctest all
+ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
+	$(MAKE) -C cpp_thread_test all
+endif
 endif
 endif

From 16f3df5d3551ff705d5d23dcdf26853114fb6956 Mon Sep 17 00:00:00 2001
From: TiborGY
Date: Sat, 1 Jun 2019 21:36:41 +0200
Subject: [PATCH 070/127] add c++ thread test option to Makefile.rule

---
 Makefile.rule | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 7c128fb49..209934991 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -220,6 +220,21 @@ COMMON_PROF = -pg
 # SYMBOLPREFIX=
 # SYMBOLSUFFIX=

+# Run a C++ based thread safety tester after the build is done.
+# This is mostly intended as a developer feature to spot regressions, but users and
+# package maintainers can enable this if they have doubts about the thread safety of
+# the library, given the configuration in this file.
+# By default, the thread safety tester launches 52 concurrent calculations at the
+# same time.
+#
+# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
+#
+# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
+# an OpenMP implementation. If you are cross-compiling, this test will probably not
+# work at all.
+#
+# CPP_THREAD_SAFETY_TEST = 1
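+#
+# A typical invocation (assuming the usual in-tree GNU make build) would be
+# something like
+#
+#     make CPP_THREAD_SAFETY_TEST=1
+#
+# which builds the library as usual and then compiles and runs the dgemv and
+# dgemm testers from the cpp_thread_test folder.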
+
 #
 # End of user configuration
 #

From 27649b95430cbed40923db4ab45119af6b05acb3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 3 Jun 2019 11:01:33 +0200
Subject: [PATCH 071/127] Document NO_AVX512 for #2151

---
 Makefile.rule | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.rule b/Makefile.rule
index 255d1da46..65d04ee3e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -163,6 +163,10 @@ NO_AFFINITY = 1
 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
 # NO_AVX2 = 1

+# Don't use SkylakeX optimizations if binutils or the compiler is too old (the build
+# system will try to determine this automatically)
+# NO_AVX512 = 1
+
 # Don't use parallel make.
 # NO_PARALLEL_MAKE = 1

From a469b32cf43772bb14253a405be8f088ce3a9d83 Mon Sep 17 00:00:00 2001
From: AbdelRauf
Date: Fri, 31 May 2019 22:48:16 +0000
Subject: [PATCH 072/127] sgemm pipeline improved, zgemm rewritten without inner packs, ABI lxvx v20 fixed with vs52

---
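Note on the register renaming below: lxv/stxv take a VSX register operand, and
the AltiVec registers v0-v31 overlay vs32-vs63, so the callee-saved vector
registers v20-v31 have to be spelled vs52-vs63 in these save/restore sequences.
The earlier "stxv v20, 288(SP)" form appears to have been assembled as a store
of vs20, leaving the non-volatile vector registers required by the ABI unsaved.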
 benchmark/gemm.c                   |    2 +-
 kernel/power/KERNEL.POWER9         |    2 +-
 kernel/power/dgemm_kernel_power9.S |   48 +-
 kernel/power/sgemm_kernel_power9.S |  140 +-
 kernel/power/sgemm_logic_power9.S  |  192 ++-
 kernel/power/sgemm_macros_power9.S |  881 ++++------
 kernel/power/zgemm_kernel_power9.S |  114 +-
 kernel/power/zgemm_logic_power9.S  |  806 ++++++----
 kernel/power/zgemm_macros_power9.S | 2307 +++++++++++++---------------
 param.h                            |    8 +-
 10 files changed, 2073 insertions(+), 2427 deletions(-)

diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index 85bcbc710..dd016a7c3 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
   for (i = 0; i < m * n * COMPSIZE; i++) {
     c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
   }
-	
+
   fprintf(stderr, "          SIZE                   Flops             Time\n");

   for (i = from; i <= to; i += step) {

diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 5c10ad64a..440eaab1b 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -42,7 +42,7 @@ ZGEMMKERNEL    =  zgemm_kernel_power9.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY    = zgemm_tcopy_8_power8.S
+ZGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
 ZGEMMONCOPYOBJ =  zgemm_oncopy.o
 ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
 ZGEMMINCOPYOBJ =  zgemm_incopy.o

diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S
index a1762dcf2..2fb1b27ef 100644
--- a/kernel/power/dgemm_kernel_power9.S
+++ b/kernel/power/dgemm_kernel_power9.S
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	std	r14, 280(SP)

-	stxv	v20, 288(SP)
-	stxv	v21, 304(SP)
-	stxv	v22, 320(SP)
-	stxv	v23, 336(SP)
-	stxv	v24, 352(SP)
-	stxv	v25, 368(SP)
-	stxv	v26, 384(SP)
-	stxv	v27, 400(SP)
-	stxv	v28, 416(SP)
-	stxv	v29, 432(SP)
-	stxv	v30, 448(SP)
-	stxv	v31, 464(SP)
+	stxv	vs52, 288(SP)
+	stxv	vs53, 304(SP)
+	stxv	vs54, 320(SP)
+	stxv	vs55, 336(SP)
+	stxv	vs56, 352(SP)
+	stxv	vs57, 368(SP)
+	stxv	vs58, 384(SP)
+	stxv	vs59, 400(SP)
+	stxv	vs60, 416(SP)
+	stxv	vs61, 432(SP)
+	stxv	vs62, 448(SP)
+	stxv	vs63, 464(SP)

 	stfd	f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ld	r15, 272(SP)
 	ld	r14, 280(SP)

-	lxv	v20, 288(SP)
-	lxv	v21, 304(SP)
-	lxv	v22, 320(SP)
-	lxv	v23, 336(SP)
-	lxv	v24, 352(SP)
-	lxv	v25, 368(SP)
-	lxv	v26, 384(SP)
-	lxv	v27, 400(SP)
-	lxv	v28, 416(SP)
-	lxv	v29, 432(SP)
-	lxv	v30, 448(SP)
-	lxv	v31, 464(SP)
+	lxv	vs52, 288(SP)
+	lxv	vs53, 304(SP)
+	lxv	vs54, 320(SP)
+	lxv	vs55, 336(SP)
+	lxv	vs56, 352(SP)
+	lxv	vs57, 368(SP)
+	lxv	vs58, 384(SP)
+	lxv	vs59, 400(SP)
+	lxv	vs60, 416(SP)
+	lxv	vs61, 432(SP)
+	lxv	vs62, 448(SP)
+	lxv	vs63, 464(SP)

 	addi	SP, SP, STACKSIZE
 	blr

diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S
index f408cdc17..7a0f3143e 100644
--- a/kernel/power/sgemm_kernel_power9.S
+++ b/kernel/power/sgemm_kernel_power9.S
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LOAD ld #define STACKSIZE (512 ) - +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 #define K r5 @@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE addi SP, SP, -STACKSIZE - li r0, 0 + mflr r0 + stfd f14, 0(SP) stfd f15, 8(SP) @@ -137,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) - + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) #if defined(TRMMKERNEL) @@ -157,72 +158,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif slwi LDC, LDC, 2 - -/* cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 -*/ /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ lis T2, perm_const2@highest - ori T2, T2, perm_const2@higher - rldicr T2, T2, 32, 31 - oris T2, T2, perm_const2@h - ori T2, T2, perm_const2@l - lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l ori T1, T1, perm_const1@l - + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 mtvsrdd permute_mask,T2,T1 - - lis T2, save_permute_12@highest - ori T2, T2, save_permute_12@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_12@h - ori T2, T2, save_permute_12@l - - lis T1, save_permute_11@highest - ori T1, T1, save_permute_11@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_11@h - ori T1, T1, save_permute_11@l - - mtvsrdd save_permute_1,T2,T1 - - lis T2, save_permute_22@highest - ori T2, T2, save_permute_22@higher - rldicr T2, T2, 32, 31 - oris T2, T2, save_permute_22@h - ori T2, T2, save_permute_22@l - - lis T1, save_permute_21@highest - ori T1, T1, save_permute_21@higher - rldicr T1, T1, 32, 31 - oris T1, T1, save_permute_21@h - ori T1, T1, save_permute_21@l - - mtvsrdd save_permute_2,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 #include "sgemm_logic_power9.S" -.L999: - addi r3, 0, 0 - +.L999: lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) + ld r0, FLINK_SAVE(SP) - addi SP, SP, STACKSIZE + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr + EPILOGUE #endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index c149cb903..25e8c8387 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,5 +1,94 @@ #define MY_ALIGN .align 3 +b L8 + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_0 + mtctr L + MY_ALIGN + +LSGEMM_L8x16_LOOP: + + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_2 64,32, 15,0 + KERNEL8x16_I1_L4_2 64,32, 16,0 + KERNEL8x16_I1_L4_2 64,32, 17,0 + KERNEL8x16_I1_L4_2 64,32, 18,0 + KERNEL8x16_I1_L4_2 64,32, 19,0 + KERNEL8x16_I1_L4_2 64,32, 20,0 + KERNEL8x16_I1_L4_2 64,32, 21,0 + KERNEL8x16_I1_L4_2 64,32, 22,0 + KERNEL8x16_I1_L4_2 64,32, 23,0 + KERNEL8x16_I1_L4_2 64,32, 24,0 + KERNEL8x16_I1_L4_2 64,32, 25,0 + KERNEL8x16_I1_L4_2 64,32, 26,0 + KERNEL8x16_I1_L4_2 64,32, 27,0 + KERNEL8x16_I1_L4_2 64,32, 28,0 + KERNEL8x16_I1_L4_2 64,32, 29,0 + KERNEL8x16_I1_L4_2 64,32, 30,0 + KERNEL8x16_I1_L4_2 64,32, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16 0, AO, BO, 64, 32 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_2 64,32, 7,0 + KERNEL8x16_I1_L4_2 64,32, 8,0 + KERNEL8x16_I1_L4_2 64,32, 9,0 + KERNEL8x16_I1_L4_2 64,32, 10,0 + KERNEL8x16_I1_L4_2 64,32, 11,0 + KERNEL8x16_I1_L4_2 64,32, 12,0 + KERNEL8x16_I1_L4_2 64,32, 13,0 + KERNEL8x16_I1_L4_2 64,32, 14,0 + KERNEL8x16_I1_L4_3 64,32, 15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_2 64,32, 3,0 + KERNEL8x16_I1_L4_2 64,32, 4,0 + KERNEL8x16_I1_L4_2 64,32, 5,0 + KERNEL8x16_I1_L4_2 64,32, 6,0 + KERNEL8x16_I1_L4_3 64,32, 7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_0 + KERNEL8x16_I1_L4_2 64,32, 0,0 + KERNEL8x16_I1_L4_2 64,32, 1,0 + KERNEL8x16_I1_L4_2 64,32, 2,0 + KERNEL8x16_I1_L4_3 64,32, 3,1 + blr + +L8: #if defined(TRMMKERNEL) && !defined(LEFT) neg TEMP_REG, OFFSET #endif @@ -39,98 +128,50 @@ LSGEMM_L8x16_BEGIN: REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ + srawi. L, T12, 7 /**(T11-1) % 128x */ #else mr T12, K addi T12,T12, -1 - srawi. L, T12, 6 /**(K-1) % 64x */ + srawi. 
L, T12, 7 /**(K-1) % 128x */ #endif ZERO8x16 ble LSGEMM_L8x16_SUB0 - - MY_ALIGN -LSGEMM_L8x16_LOOP_START: - - LOAD8x16_0 /*we already zeroed */ - /*##OffsetA=64 OffsetB=32 - #addi AO,AO,2112 - #addi BO,BO,32 */ - - mtctr L - - MY_ALIGN - -LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,1 - - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - - END8x16 0, AO, BO, 64, 32 - - b LSGEMM_L8x16_SUB1 + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 MY_ALIGN LSGEMM_L8x16_SUB0: #if defined(TRMMKERNEL) - andi. L, T11, 127 + andi. L, T11, 255 + cmpwi T11,128 #else - andi. L, K, 127 + andi. L, K, 255 + cmpwi K,128 #endif - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L8x16_SAVE + + bne LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB2_128: + bl LSGEMM_L8x16_L64_SUB + bl LSGEMM_L8x16_L64_SUB + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: - - srawi. T10,L, 5 + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 ble LSGEMM_L8x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L8x16_SUB2_LOOP: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 - bdnz LSGEMM_L8x16_SUB2_LOOP - MY_ALIGN + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN LSGEMM_L8x16_SUB2_16: andi. T10,L, 16 ble LSGEMM_L8x16_SUB2_8 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + bl LSGEMM_L8x16_L16_SUB MY_ALIGN LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 @@ -155,8 +196,7 @@ LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 ble LSGEMM_L8x16_SAVE KERNEL8x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L8x16_SUB2 + MY_ALIGN LSGEMM_L8x16_SAVE: diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index c61f419ac..3f86a1d25 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -62,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 + KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast @@ -112,15 +112,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxv vs24, 0(BO) lxv vs28, 16(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, 0(AO) lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - + lxv vs2, 32(AO) + lxv vs3, 48(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 @@ -259,247 +258,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 - - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - 
xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 +KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm @@ -509,224 +269,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) +.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs52, vs0,vs29 + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs60, vs0,vs31 xxpermdi vs11, vs10, vs10,2 xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs37, vs1,vs25 - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs40, vs0,vs26 xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs62, vs2,vs31 + + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp 
vs36, vs4,vs9 .if \Complete==0 lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif +.endif .if \IsLast==1 .if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) .else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) + addi \BREG, \BREG, DISP16(\Index,64) + .endif +.endif + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask .endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs52, vs4,vs13 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 .endif + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs60, vs4,vs15 .if \Complete==0 xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 + +.endif - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - xvmulsp vs50, vs6,vs12 - xvmulsp vs51, vs7,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - xvmulsp vs54, vs6,vs13 - xvmulsp vs55, vs7,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - xvmulsp vs58, vs6,vs14 - xvmulsp vs59, vs7,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - xvmulsp vs62, vs6,vs15 - xvmulsp vs63, vs7,vs15 - -.else - xvmaddasp vs40, vs4,vs10 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs37, vs5,vs9 xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - xvmaddasp vs48, vs4,vs12 xvmaddasp vs49, vs5,vs12 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 - - xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 - - xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 - - xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs62, vs6,vs15 + + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs51, vs7,vs12 + xvmaddasp vs55, vs7,vs13 + xvmaddasp vs59, vs7,vs14 xvmaddasp vs63, vs7,vs15 - -.endif - + .endm @@ -763,7 +433,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. xxmrghw vs2, vs37, vs41 xxmrghw vs3, vs33, vs45 - +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif xxmrglw vs16, vs34, vs46 xxmrglw vs18, vs38, vs42 @@ -784,176 +457,203 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghw vs30, vs39, vs43 xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) +#ifndef TRMMKERNEL lxv vs34, 32(CO) lxv vs35, 48(CO) #endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 #ifndef TRMMKERNEL lxv vs36, 0(T1) lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL lxv vs38, 32(T1) lxv vs39, 48(T1) #endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + #ifndef TRMMKERNEL lxv vs40, 0(T2) lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL lxv vs42, 32(T2) lxv vs43, 48(T2) #endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 #ifndef TRMMKERNEL lxv vs44, 0(T3) - lxv vs45, 16(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL lxv vs46, 32(T3) lxv vs47, 48(T3) #endif - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 + + - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 - +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs24, vs30, save_permute_1 xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv 
vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - stxv vs36, 0(T1) - stxv vs37, 16(T1) stxv vs38, 32(T1) stxv vs39, 48(T1) +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T2) stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T2) stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif stxv vs44, 0(T3) stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif stxv vs46, 32(T3) stxv vs47, 48(T3) /*****the same with the second 8X8 ****/ -#ifndef TRMMKERNEL - + #ifndef TRMMKERNEL lxv vs32, 0(T4) lxv vs33, 16(T4) - lxv vs34, 32(T4) - lxv vs35, 48(T4) - lxv vs36, 0(T5) - lxv vs37, 16(T5) - lxv vs38,32(T5) - lxv vs39, 48(T5) #endif - xxmrglw vs8, vs48, vs60 xxmrglw vs10, vs52, vs56 - +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif xxmrghw vs1, vs48, vs60 xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif xxmrglw vs12, vs49, vs61 xxmrglw vs14, vs53, vs57 - -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) - lxv vs42, 32(T6) - lxv vs43, 48(T6) - lxv vs44, 0(T7) - lxv vs45, 16(T7) - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + xxmrghw vs2, vs53, vs57 xxmrghw vs3, vs49, vs61 - +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif xxmrglw vs16, vs50, vs62 xxmrglw vs18, vs54, vs58 - +#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif xxlor vs9, vs8, vs8 xxlor vs11, vs10, vs10 xxmrghw vs4, vs54, vs58 xxmrghw vs5, vs50, vs62 - +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif xxlor vs13, vs12, vs12 xxlor vs15, vs14, vs14 xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 - + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif xxlor vs17, vs16, vs16 xxlor vs19, vs18, vs18 xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 + xxmrghw vs31, vs51, vs63 + + xxperm vs8, vs0, save_permute_1 xxperm vs10, vs1, save_permute_1 @@ -965,11 +665,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs27, vs26, vs26 xxperm vs12, vs2, save_permute_1 xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 xxperm vs15, vs3, save_permute_2 - + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif xxperm vs16, vs4, save_permute_1 xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) xxperm vs17, vs4, save_permute_2 xxperm vs19, vs5, save_permute_2 xxperm vs24, vs30, save_permute_1 @@ -977,64 +686,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxperm vs25, vs30, save_permute_2 xxperm vs27, vs31, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r +#ifdef TRMMKERNEL xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL xvmulsp vs38, vs17, alpha_r xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r +#else xvmaddasp vs38, vs17, alpha_r xvmaddasp vs39, vs25, alpha_r #endif - stxv vs32, 0(T4) - stxv vs33, 16(T4) - stxv vs34, 32(T4) - stxv vs35, 48(T4) - stxv vs36, 0(T5) - stxv vs37, 16(T5) + + stxv vs38, 32(T5) stxv vs39, 48(T5) + #ifdef TRMMKERNEL xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - + xvmulsp vs41, vs14, alpha_r +#else xvmaddasp vs40, vs10, alpha_r xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - #endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif stxv vs42, 32(T6) stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T7) stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T7) stxv vs47, 48(T7) @@ -1224,12 +946,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs10, vs8, permute_mask xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 xvmaddasp vs32, vs0,vs24 xvmaddasp vs33, vs1,vs24 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs36, vs0,vs25 xvmaddasp vs37, vs1,vs25 @@ -1247,21 +971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - + lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) xvmaddasp vs56, vs0,vs30 xvmaddasp vs57, vs1,vs30 xvmaddasp vs60, vs0,vs31 xvmaddasp vs61, vs1,vs31 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 @@ -1285,21 +1009,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4,vs13 xvmaddasp vs53, vs5,vs13 - + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) xvmaddasp vs56, vs4,vs14 xvmaddasp vs57, vs5,vs14 xvmaddasp vs60, vs4,vs15 xvmaddasp vs61, vs5,vs15 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 xxpermdi vs13, vs12, vs12,2 @@ -1323,22 +1048,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs52, vs0,vs29 xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - .if \Complete==0 lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.if \Complete==0 lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask +.if \Complete==0 xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index e655f0bfe..a41bcec77 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -30,10 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LOAD ld -#define STACKSIZE 32192 +#define STACKSIZE 512 #define FZERO 312+192(SP) - + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ #define M r3 #define N r4 @@ -56,20 +57,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FRAMEPOINTER r12 -#define BBUFFER r14 +#define T10 r14 #define L r15 -#define ALPHA r16 +#define T8 r16 #define T5 r17 #define T2 r19 -#define BBO r20 -#define o8 r21 +#define T9 r20 +#define T6 r21 #define I r22 #define J r23 #define AO r24 #define BO r25 #define CO r26 -#define o16 r27 +#define T7 r27 #define T3 r28 #define T4 r29 @@ -82,12 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROFCODE mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - addi SP, SP, -STACKSIZE - li r0, 0 - + addi SP, SP, -STACKSIZE + mflr r0 stfd f14, 0(SP) stfd f15, 8(SP) stfd f16, 16(SP) @@ -111,6 +108,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f30, 128(SP) stfd f31, 136(SP) + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ std r31, 144(SP) std r30, 152(SP) @@ -132,21 +131,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r14, 280(SP) - stxv v20, 288(SP) - stxv v21, 304(SP) - stxv v22, 320(SP) - stxv v23, 336(SP) - stxv v24, 352(SP) - stxv v25, 368(SP) - stxv v26, 384(SP) - stxv v27, 400(SP) - stxv v28, 416(SP) - stxv v29, 432(SP) - stxv v30, 448(SP) - stxv v31, 464(SP) + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) - stw r0, FZERO #ifdef linux ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) @@ -162,35 +161,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_macros_power9.S" - cmpwi cr0, M, 0 - ble L999 - cmpwi cr0, N, 0 - ble L999 - cmpwi cr0, K, 0 - ble L999 + slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li o8 , 8 - li o16 , 16 - - addi BBUFFER, SP, 512+4096 - li T1, -4096 - and BBUFFER, BBUFFER, T1 - + li PRE, 512 + li r0, 0 - addi ALPHA, SP, 296+192 - - xxlor alpha_r,vs1,vs1 /*copy from register f1 */ - xxlor alpha_i,vs2,vs2 /*copy from register f2 */ +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif .align 4 #include "zgemm_logic_power9.S" L999: - addi r3, 0, 0 - + lfd f14, 0(SP) lfd f15, 8(SP) lfd f16, 16(SP) @@ -233,24 +221,24 @@ L999: ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) - - lxv v20, 288(SP) - lxv v21, 304(SP) - lxv v22, 320(SP) - lxv v23, 336(SP) - lxv v24, 352(SP) - lxv v25, 368(SP) - lxv v26, 384(SP) - lxv v27, 400(SP) - lxv v28, 416(SP) - lxv v29, 432(SP) - lxv v30, 448(SP) - lxv v31, 464(SP) - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE - addi SP, SP, STACKSIZE + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE blr EPILOGUE diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 77ce36294..01685fe79 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -25,155 +25,348 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
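The conditional xvnegdp pair above is the counterpart of the sign table in AGGREGATE_REALS_IMAGES further down: for the CC/CR/RC/RR variants the hot loop produces the product with both components negated, and multiplying by a pre-negated alpha cancels that sign at no extra cost per iteration. A scalar sketch of the identity being exploited, illustrative only (the kernel of course works on packed vector lanes):

    #include <complex.h>
    #include <stdio.h>

    int main(void)
    {
        double complex a = 1 + 2*I, b = 3 + 4*I, alpha = 0.5 + 0.25*I;
        /* CC variant: the wanted contribution is alpha*conj(a)*conj(b) */
        double complex want = alpha * conj(a) * conj(b);
        /* the loop accumulates the negated product; negating alpha once
           up front (xvnegdp) restores it: (-alpha)*(-x) == alpha*x */
        double complex got = (-alpha) * -(conj(a) * conj(b));
        printf("%g%+gi  %g%+gi\n", creal(want), cimag(want),
               creal(got), cimag(got));
        return 0;
    }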
*****************************************************************************/ #define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ + + + +/* 2x8 MAIN 128x+1 LOOP */ +ZGEMM_L2x8_LMAIN_SUB: + mtctr L + LOAD2x8 0 + MY_ALIGN +ZGEMM_L2x8_LOOP: + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_L 128,32,31,0 + KERNEL2x8_L 128,32,32,0 + KERNEL2x8_L 128,32,33,0 + KERNEL2x8_L 128,32,34,0 + KERNEL2x8_L 128,32,35,0 + KERNEL2x8_L 128,32,36,0 + KERNEL2x8_L 128,32,37,0 + KERNEL2x8_L 128,32,38,0 + KERNEL2x8_L 128,32,39,0 + KERNEL2x8_L 128,32,40,0 + KERNEL2x8_L 128,32,41,0 + KERNEL2x8_L 128,32,42,0 + KERNEL2x8_L 128,32,43,0 + KERNEL2x8_L 128,32,44,0 + KERNEL2x8_L 128,32,45,0 + KERNEL2x8_L 128,32,46,0 + KERNEL2x8_L 128,32,47,0 + KERNEL2x8_L 128,32,48,0 + KERNEL2x8_L 128,32,49,0 + KERNEL2x8_L 128,32,50,0 + KERNEL2x8_L 128,32,51,0 + KERNEL2x8_L 128,32,52,0 + KERNEL2x8_L 128,32,53,0 + KERNEL2x8_L 128,32,54,0 + KERNEL2x8_L 128,32,55,0 + KERNEL2x8_L 128,32,56,0 + KERNEL2x8_L 128,32,57,0 + KERNEL2x8_L 128,32,58,0 + KERNEL2x8_L 128,32,59,0 + KERNEL2x8_L 128,32,60,0 + KERNEL2x8_L 128,32,61,0 + KERNEL2x8_L 128,32,62,0 + KERNEL2x8_L 128,32,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: + END2x8 AO, BO, 128,32 + blr + + MY_ALIGN +ZGEMM_2x8_L64_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 + KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,0 + KERNEL2x8_L 128,32,16,0 + KERNEL2x8_L 128,32,17,0 + KERNEL2x8_L 128,32,18,0 + KERNEL2x8_L 128,32,19,0 + KERNEL2x8_L 128,32,20,0 + KERNEL2x8_L 128,32,21,0 + KERNEL2x8_L 128,32,22,0 + KERNEL2x8_L 128,32,23,0 + KERNEL2x8_L 128,32,24,0 + KERNEL2x8_L 128,32,25,0 + KERNEL2x8_L 128,32,26,0 + KERNEL2x8_L 128,32,27,0 + KERNEL2x8_L 128,32,28,0 + KERNEL2x8_L 128,32,29,0 + KERNEL2x8_L 128,32,30,0 + KERNEL2x8_E 128,32,31,1 + blr + + + MY_ALIGN +ZGEMM_2x8_L32_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L 128,32,8,0 + KERNEL2x8_L 128,32,9,0 + KERNEL2x8_L 128,32,10,0 
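These mini subroutines let the 2x8 block dispatch K in large unrolled chunks and mop up the remainder bit by bit: the 128x main loop runs under the CTR, and the leftover (K-1) & 127 iterations are peeled by the L64/L32/L16 helpers and the small tail cases that follow below. In C the decomposition looks roughly like this; k_steps is an invented name that merely counts what the branch ladder executes, a control-flow sketch rather than the kernel:

    /* One iteration is peeled into the up-front LOAD2x8; the rest of K-1
       is covered by the 128x main loop plus descending power-of-two arms. */
    long k_steps(long k)
    {
        long done = 1;                      /* peeled first iteration     */
        long t = k - 1;
        done += (t >> 7) * 128;             /* srawi. L, T1, 7: main loop */
        long rem = t & 127;                 /* andi. L, T1, 127           */
        for (long blk = 64; blk >= 1; blk >>= 1)
            if (rem & blk)                  /* ZGEMM_L2x8_SUB2_<blk> arms */
                done += blk;
        return done;                        /* equals k for any k >= 1    */
    }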
+ KERNEL2x8_L 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L 128,32,12,0 + KERNEL2x8_L 128,32,13,0 + KERNEL2x8_L 128,32,14,0 + KERNEL2x8_L 128,32,15,1 + blr + MY_ALIGN + +ZGEMM_2x8_L16_SUB: + LOAD2x8 0 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L 128,32,0,0 + KERNEL2x8_L 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L 128,32,2,0 + KERNEL2x8_L 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L 128,32,4,0 + KERNEL2x8_L 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L 128,32,6,0 + KERNEL2x8_L 128,32,7,1 + blr + MY_ALIGN + +ZGEMM_2x4_LMAIN_SUB: + mtctr L + LOAD2x4 0 + MY_ALIGN +ZGEMM_L2x4_LOOP: + KERNEL2x4_L 64,32,0,0 + KERNEL2x4_L 64,32,1,0 + KERNEL2x4_L 64,32,2,0 + KERNEL2x4_L 64,32,3,0 + KERNEL2x4_L 64,32,4,0 + KERNEL2x4_L 64,32,5,0 + KERNEL2x4_L 64,32,6,0 + KERNEL2x4_L 64,32,7,0 + KERNEL2x4_L 64,32,8,0 + KERNEL2x4_L 64,32,9,0 + KERNEL2x4_L 64,32,10,0 + KERNEL2x4_L 64,32,11,0 + KERNEL2x4_L 64,32,12,0 + KERNEL2x4_L 64,32,13,0 + KERNEL2x4_L 64,32,14,0 + KERNEL2x4_L 64,32,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: + END2x4 AO, BO, 64,32 + blr + + MY_ALIGN +ZGEMM_2x4_L16_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_L 64,32, 3,0 + KERNEL2x4_L 64,32, 4,0 + KERNEL2x4_L 64,32, 5,0 + KERNEL2x4_L 64,32, 6,0 + KERNEL2x4_E 64,32, 7,1 + blr + + MY_ALIGN +ZGEMM_2x4_L8_SUB: + LOAD2x4 0 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_L 64,32, 1,0 + KERNEL2x4_L 64,32, 2,0 + KERNEL2x4_E 64,32, 3,1 + blr + +/* MAIN LOOP BEGINS */ + + MY_ALIGN +ZGEMM_L2: srawi. J, N, 1 ble ZGEMM_L2_END ZGEMM_L2_BEGIN: - - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 2 - ble ZGEMM_L2_COPYB1 - -ZGEMM_L2_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB8 - -ZGEMM_L2_COPYB1: - - andi. T1, K, 3 - ble ZGEMM_L2_COPYB_END - -ZGEMM_L2_COPYB_LOOP: - - ZCOPYB_2 - addic. T1, T1, -1 - - bgt ZGEMM_L2_COPYB_LOOP - -ZGEMM_L2_COPYB_END: - - mr CO, C - mr AO, A - slwi T1, LDC , 1 + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A add C, C, T1 srawi. I, M, 3 ble ZGEMM_L2x8_END - -ZGEMM_L2x8_BEGIN: - - - mr BO, BBUFFER + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 +ZGEMM_L2x8_BEGIN: mr T1, K + mr BO, B + dcbt B, r0 + dcbt AO, r0 + /* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ + /* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. L, T1, 7 /**(K-1) % 128x */ + ZERO2x8 ble ZGEMM_L2x8_SUB0 - - -ZGEMM_L2x8_LOOP_START: - - LOAD2x8 0 - li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L - - MY_ALIGN -ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,64,0,0 - KERNEL2x8_L 128,64,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,64,2,0 - KERNEL2x8_L 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,64,4,0 - KERNEL2x8_L 128,64,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,64,6,0 - KERNEL2x8_L 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,64,8,0 - KERNEL2x8_L 128,64,9,0 - KERNEL2x8_L 128,64,10,0 - KERNEL2x8_L 128,64,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,64,12,0 - KERNEL2x8_L 128,64,13,0 - KERNEL2x8_L 128,64,14,0 - KERNEL2x8_L 128,64,15,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128, 64 - - b ZGEMM_L2x8_SUB1 - -ZGEMM_L2x8_SUB0: - - andi. L, K, 63 - - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB1: - - andi. L, T1, 31 + bl ZGEMM_L2x8_LMAIN_SUB + + andi. L, T1, 127 ble ZGEMM_L2x8_SAVE - -ZGEMM_L2x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x8_SUB2_4 - mtctr T1 + b ZGEMM_L2x8_SUB2 + +ZGEMM_L2x8_SUB0: + andi. 
L, K, 255 + cmpwi K,128 + bne ZGEMM_L2x8_SUB2 + MY_ALIGN +ZGEMM_L2x8_SUB2_128: + bl ZGEMM_2x8_L64_SUB + bl ZGEMM_2x8_L64_SUB + b ZGEMM_L2x8_SAVE MY_ALIGN -ZGEMM_L2x8_SUB2_LOOP: +ZGEMM_L2x8_SUB2: + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_32: + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_16: + andi. T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN +ZGEMM_L2x8_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_L 128,64, 1,0 - KERNEL2x8_L 128,64, 2,0 - KERNEL2x8_E 128,64, 3,1 - bdnz ZGEMM_L2x8_SUB2_LOOP - MY_ALIGN + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_L 128,32, 1,0 + KERNEL2x8_L 128,32, 2,0 + KERNEL2x8_E 128,32, 3,1 + MY_ALIGN ZGEMM_L2x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 LOAD2x8 0 - KERNEL2x8_L 128,64, 0,0 - KERNEL2x8_E 128,64, 1,1 + KERNEL2x8_L 128,32, 0,0 + KERNEL2x8_E 128,32, 1,1 MY_ALIGN ZGEMM_L2x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 LOAD2x8 0 - KERNEL2x8_E 128,64, 0,1 + KERNEL2x8_E 128,32, 0,1 MY_ALIGN ZGEMM_L2x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L2x8_SAVE - KERNEL2x8 - -/* addic. L, L, -1 - bgt ZGEMM_L2x8_SUB2_1*/ + KERNEL2x8 ZGEMM_L2x8_SAVE: - + addic. I, I, -1 SAVE2x8 - addic. I, I, -1 bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN ZGEMM_L2x8_END: ZGEMM_L2x4_BEGIN: @@ -183,70 +376,50 @@ ZGEMM_L2x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L2x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x4 + ZERO2x4 + srawi. L, T1, 5 /**(K-1) % 32x */ + ble ZGEMM_L2x4_SUB0 - -ZGEMM_L2x4_LOOP_START: - LOAD2x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,64,0,0 - KERNEL2x4_L 64,64,1,0 - KERNEL2x4_L 64,64,2,0 - KERNEL2x4_L 64,64,3,0 - KERNEL2x4_L 64,64,4,0 - KERNEL2x4_L 64,64,5,0 - KERNEL2x4_L 64,64,6,0 - KERNEL2x4_L 64,64,7,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64, 64 - - b ZGEMM_L2x4_SUB1 - -ZGEMM_L2x4_SUB0: - - andi. L, K, 31 - + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE b ZGEMM_L2x4_SUB2 -ZGEMM_L2x4_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x4_SAVE - -ZGEMM_L2x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x4_SUB2_4 - mtctr T1 +ZGEMM_L2x4_SUB0: + andi. L, K, 63 + cmpwi K,32 + bne ZGEMM_L2x4_SUB2 + MY_ALIGN +ZGEMM_L2x4_SUB2_32: + bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB + b ZGEMM_L2x4_SAVE + MY_ALIGN +ZGEMM_L2x4_SUB2: + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_LOOP: - LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_L 64,64, 1,0 - KERNEL2x4_L 64,64, 2,0 - KERNEL2x4_E 64,64, 3,1 - bdnz ZGEMM_L2x4_SUB2_LOOP +ZGEMM_L2x4_SUB2_8: + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB MY_ALIGN ZGEMM_L2x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 LOAD2x4 0 - KERNEL2x4_L 64,64, 0,0 - KERNEL2x4_E 64,64, 1,1 + KERNEL2x4_L 64,32, 0,0 + KERNEL2x4_E 64,32, 1,1 MY_ALIGN ZGEMM_L2x4_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 LOAD2x4 0 - KERNEL2x4_E 64,64, 0,1 + KERNEL2x4_E 64,32, 0,1 MY_ALIGN ZGEMM_L2x4_SUB2_1: andi. T1,L, 1 @@ -259,12 +432,11 @@ ZGEMM_L2x4_SAVE: ZGEMM_L2x4_END: -ZGEMM_L2x2_BEGIN: - +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L2x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 4 /**(K-1) % 16x */ @@ -277,18 +449,18 @@ ZGEMM_L2x2_LOOP_START: MY_ALIGN ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,64,0,0 - KERNEL2x2_L 32,64,1,0 - KERNEL2x2_L 32,64,2,0 - KERNEL2x2_L 32,64,3,0 - KERNEL2x2_L 32,64,4,0 - KERNEL2x2_L 32,64,5,0 - KERNEL2x2_L 32,64,6,0 - KERNEL2x2_L 32,64,7,1 + KERNEL2x2_L 32,32,0,0 + KERNEL2x2_L 32,32,1,0 + KERNEL2x2_L 32,32,2,0 + KERNEL2x2_L 32,32,3,0 + KERNEL2x2_L 32,32,4,0 + KERNEL2x2_L 32,32,5,0 + KERNEL2x2_L 32,32,6,0 + KERNEL2x2_L 32,32,7,1 bdnz ZGEMM_L2x2_LOOP MY_ALIGN ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32, 64 + END2x2 AO, BO, 32,32 b ZGEMM_L2x2_SUB1 @@ -310,24 +482,24 @@ ZGEMM_L2x2_SUB2: MY_ALIGN ZGEMM_L2x2_SUB2_LOOP: LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_L 32,64, 1,0 - KERNEL2x2_L 32,64, 2,0 - KERNEL2x2_E 32,64, 3,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_L 32,32, 1,0 + KERNEL2x2_L 32,32, 2,0 + KERNEL2x2_E 32,32, 3,1 bdnz ZGEMM_L2x2_SUB2_LOOP MY_ALIGN ZGEMM_L2x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 LOAD2x2 0 - KERNEL2x2_L 32,64, 0,0 - KERNEL2x2_E 32,64, 1,1 + KERNEL2x2_L 32,32, 0,0 + KERNEL2x2_E 32,32, 1,1 MY_ALIGN ZGEMM_L2x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 LOAD2x2 0 - KERNEL2x2_E 32,64, 0,1 + KERNEL2x2_E 32,32, 0,1 MY_ALIGN ZGEMM_L2x2_SUB2_1: andi. T1,L, 1 @@ -339,12 +511,12 @@ ZGEMM_L2x2_SAVE: ZGEMM_L2x2_END: -ZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L2x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 4 /**(K-1) % 16x */ @@ -358,18 +530,18 @@ ZGEMM_L2x1_LOOP_START: MY_ALIGN ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,64,0,0 - KERNEL2x1_L 16,64,1,0 - KERNEL2x1_L 16,64,2,0 - KERNEL2x1_L 16,64,3,0 - KERNEL2x1_L 16,64,4,0 - KERNEL2x1_L 16,64,5,0 - KERNEL2x1_L 16,64,6,0 - KERNEL2x1_L 16,64,7,1 + KERNEL2x1_L 16,32,0,0 + KERNEL2x1_L 16,32,1,0 + KERNEL2x1_L 16,32,2,0 + KERNEL2x1_L 16,32,3,0 + KERNEL2x1_L 16,32,4,0 + KERNEL2x1_L 16,32,5,0 + KERNEL2x1_L 16,32,6,0 + KERNEL2x1_L 16,32,7,1 bdnz ZGEMM_L2x1_LOOP MY_ALIGN ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16, 64 + END2x1 AO, BO, 16,32 b ZGEMM_L2x1_SUB1 @@ -391,24 +563,24 @@ ZGEMM_L2x1_SUB2: MY_ALIGN ZGEMM_L2x1_SUB2_LOOP: LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_L 16,64, 1,0 - KERNEL2x1_L 16,64, 2,0 - KERNEL2x1_E 16,64, 3,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_L 16,32, 1,0 + KERNEL2x1_L 16,32, 2,0 + KERNEL2x1_E 16,32, 3,1 bdnz ZGEMM_L2x1_SUB2_LOOP MY_ALIGN ZGEMM_L2x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 LOAD2x1 0 - KERNEL2x1_L 16,64, 0,0 - KERNEL2x1_E 16,64, 1,1 + KERNEL2x1_L 16,32, 0,0 + KERNEL2x1_E 16,32, 1,1 MY_ALIGN ZGEMM_L2x1_SUB2_2: andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 LOAD2x1 0 - KERNEL2x1_E 16,64, 0,1 + KERNEL2x1_E 16,32, 0,1 MY_ALIGN ZGEMM_L2x1_SUB2_1: andi. T1,L, 1 @@ -442,36 +614,6 @@ ZGEMM_L1_BEGIN: andi. T1, N, 1 ble ZGEMM_L1_END - mr BO, B - mr BBO, BBUFFER - srawi. T1, K, 3 /*this time K/8 */ - ble ZGEMM_L1_COPYB1 - -ZGEMM_L1_COPYB8: - - addi T2, PRE, 128 - dcbt BO, PRE - dcbtst BBO, PRE - dcbtst BBO, T2 - ZCOPYB_8 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB8 - -ZGEMM_L1_COPYB1: - - andi. T1, K, 7 - ble ZGEMM_L1_COPYB_END - -ZGEMM_L1_COPYB_LOOP: - - ZCOPYB_1 - addic. T1, T1, -1 - - bgt ZGEMM_L1_COPYB_LOOP - -ZGEMM_L1_COPYB_END: - mr CO, C mr AO, A srawi. I, M, 3 @@ -480,7 +622,7 @@ ZGEMM_L1_COPYB_END: ZGEMM_L1x8_BEGIN: - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. 
L, T1, 5 /**(K-1) % 32x */ @@ -501,33 +643,33 @@ ZGEMM_L1x8_LOOP_START: ZGEMM_L1x8_LOOP: dcbt AO, PRE dcbt BO, PRE - KERNEL1x8_L 128,32,0,0 - KERNEL1x8_L 128,32,1,0 + KERNEL1x8_L 128,16,0,0 + KERNEL1x8_L 128,16,1,0 dcbt AO, T2 - KERNEL1x8_L 128,32,2,0 - KERNEL1x8_L 128,32,3,0 + KERNEL1x8_L 128,16,2,0 + KERNEL1x8_L 128,16,3,0 dcbt AO, T3 dcbt BO, T2 - KERNEL1x8_L 128,32,4,0 - KERNEL1x8_L 128,32,5,0 + KERNEL1x8_L 128,16,4,0 + KERNEL1x8_L 128,16,5,0 dcbt AO, T4 - KERNEL1x8_L 128,32,6,0 - KERNEL1x8_L 128,32,7,0 + KERNEL1x8_L 128,16,6,0 + KERNEL1x8_L 128,16,7,0 dcbt AO, T5 dcbt BO, T3 - KERNEL1x8_L 128,32,8,0 - KERNEL1x8_L 128,32,9,0 - KERNEL1x8_L 128,32,10,0 - KERNEL1x8_L 128,32,11,0 + KERNEL1x8_L 128,16,8,0 + KERNEL1x8_L 128,16,9,0 + KERNEL1x8_L 128,16,10,0 + KERNEL1x8_L 128,16,11,0 dcbt BO, T4 - KERNEL1x8_L 128,32,12,0 - KERNEL1x8_L 128,32,13,0 - KERNEL1x8_L 128,32,14,0 - KERNEL1x8_L 128,32,15,1 + KERNEL1x8_L 128,16,12,0 + KERNEL1x8_L 128,16,13,0 + KERNEL1x8_L 128,16,14,0 + KERNEL1x8_L 128,16,15,1 bdnz ZGEMM_L1x8_LOOP MY_ALIGN ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128, 32 + END1x8 AO, BO, 128,16 b ZGEMM_L1x8_SUB1 @@ -549,32 +691,30 @@ ZGEMM_L1x8_SUB2: MY_ALIGN ZGEMM_L1x8_SUB2_LOOP: LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_L 128,32, 1,0 - KERNEL1x8_L 128,32, 2,0 - KERNEL1x8_E 128,32, 3,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_L 128,16, 1,0 + KERNEL1x8_L 128,16, 2,0 + KERNEL1x8_E 128,16, 3,1 bdnz ZGEMM_L1x8_SUB2_LOOP MY_ALIGN ZGEMM_L1x8_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 LOAD1x8 0 - KERNEL1x8_L 128,32, 0,0 - KERNEL1x8_E 128,32, 1,1 + KERNEL1x8_L 128,16, 0,0 + KERNEL1x8_E 128,16, 1,1 MY_ALIGN ZGEMM_L1x8_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 LOAD1x8 0 - KERNEL1x8_E 128,32, 0,1 + KERNEL1x8_E 128,16, 0,1 MY_ALIGN ZGEMM_L1x8_SUB2_1: andi. T1,L, 1 ble ZGEMM_L1x8_SAVE KERNEL1x8 - -/* addic. L, L, -1 - bgt ZGEMM_L1x8_SUB2_1*/ + ZGEMM_L1x8_SAVE: @@ -592,7 +732,7 @@ ZGEMM_L1x4_BEGIN: andi. T1, M, 4 ble ZGEMM_L1x4_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -605,26 +745,26 @@ ZGEMM_L1x4_LOOP_START: MY_ALIGN ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,32,0,0 - KERNEL1x4_L 64,32,1,0 - KERNEL1x4_L 64,32,2,0 - KERNEL1x4_L 64,32,3,0 - KERNEL1x4_L 64,32,4,0 - KERNEL1x4_L 64,32,5,0 - KERNEL1x4_L 64,32,6,0 - KERNEL1x4_L 64,32,7,0 - KERNEL1x4_L 64,32,8,0 - KERNEL1x4_L 64,32,9,0 - KERNEL1x4_L 64,32,10,0 - KERNEL1x4_L 64,32,11,0 - KERNEL1x4_L 64,32,12,0 - KERNEL1x4_L 64,32,13,0 - KERNEL1x4_L 64,32,14,0 - KERNEL1x4_L 64,32,15,1 + KERNEL1x4_L 64,16,0,0 + KERNEL1x4_L 64,16,1,0 + KERNEL1x4_L 64,16,2,0 + KERNEL1x4_L 64,16,3,0 + KERNEL1x4_L 64,16,4,0 + KERNEL1x4_L 64,16,5,0 + KERNEL1x4_L 64,16,6,0 + KERNEL1x4_L 64,16,7,0 + KERNEL1x4_L 64,16,8,0 + KERNEL1x4_L 64,16,9,0 + KERNEL1x4_L 64,16,10,0 + KERNEL1x4_L 64,16,11,0 + KERNEL1x4_L 64,16,12,0 + KERNEL1x4_L 64,16,13,0 + KERNEL1x4_L 64,16,14,0 + KERNEL1x4_L 64,16,15,1 bdnz ZGEMM_L1x4_LOOP MY_ALIGN ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64, 32 + END1x4 AO, BO, 64,16 b ZGEMM_L1x4_SUB1 @@ -646,24 +786,24 @@ ZGEMM_L1x4_SUB2: MY_ALIGN ZGEMM_L1x4_SUB2_LOOP: LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_L 64,32, 1,0 - KERNEL1x4_L 64,32, 2,0 - KERNEL1x4_E 64,32, 3,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_L 64,16, 1,0 + KERNEL1x4_L 64,16, 2,0 + KERNEL1x4_E 64,16, 3,1 bdnz ZGEMM_L1x4_SUB2_LOOP MY_ALIGN ZGEMM_L1x4_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 LOAD1x4 0 - KERNEL1x4_L 64,32, 0,0 - KERNEL1x4_E 64,32, 1,1 + KERNEL1x4_L 64,16, 0,0 + KERNEL1x4_E 64,16, 1,1 MY_ALIGN ZGEMM_L1x4_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x4_SUB2_1 LOAD1x4 0 - KERNEL1x4_E 64,32, 0,1 + KERNEL1x4_E 64,16, 0,1 MY_ALIGN ZGEMM_L1x4_SUB2_1: andi. T1,L, 1 @@ -681,7 +821,7 @@ ZGEMM_L1x2_BEGIN: andi. T1, M, 2 ble ZGEMM_L1x2_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -694,26 +834,26 @@ ZGEMM_L1x2_LOOP_START: MY_ALIGN ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,32,0,0 - KERNEL1x2_L 32,32,1,0 - KERNEL1x2_L 32,32,2,0 - KERNEL1x2_L 32,32,3,0 - KERNEL1x2_L 32,32,4,0 - KERNEL1x2_L 32,32,5,0 - KERNEL1x2_L 32,32,6,0 - KERNEL1x2_L 32,32,7,0 - KERNEL1x2_L 32,32,8,0 - KERNEL1x2_L 32,32,9,0 - KERNEL1x2_L 32,32,10,0 - KERNEL1x2_L 32,32,11,0 - KERNEL1x2_L 32,32,12,0 - KERNEL1x2_L 32,32,13,0 - KERNEL1x2_L 32,32,14,0 - KERNEL1x2_L 32,32,15,1 + KERNEL1x2_L 32,16,0,0 + KERNEL1x2_L 32,16,1,0 + KERNEL1x2_L 32,16,2,0 + KERNEL1x2_L 32,16,3,0 + KERNEL1x2_L 32,16,4,0 + KERNEL1x2_L 32,16,5,0 + KERNEL1x2_L 32,16,6,0 + KERNEL1x2_L 32,16,7,0 + KERNEL1x2_L 32,16,8,0 + KERNEL1x2_L 32,16,9,0 + KERNEL1x2_L 32,16,10,0 + KERNEL1x2_L 32,16,11,0 + KERNEL1x2_L 32,16,12,0 + KERNEL1x2_L 32,16,13,0 + KERNEL1x2_L 32,16,14,0 + KERNEL1x2_L 32,16,15,1 bdnz ZGEMM_L1x2_LOOP MY_ALIGN ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32, 32 + END1x2 AO, BO, 32,16 b ZGEMM_L1x2_SUB1 @@ -735,24 +875,24 @@ ZGEMM_L1x2_SUB2: MY_ALIGN ZGEMM_L1x2_SUB2_LOOP: LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_L 32,32, 1,0 - KERNEL1x2_L 32,32, 2,0 - KERNEL1x2_E 32,32, 3,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_L 32,16, 1,0 + KERNEL1x2_L 32,16, 2,0 + KERNEL1x2_E 32,16, 3,1 bdnz ZGEMM_L1x2_SUB2_LOOP MY_ALIGN ZGEMM_L1x2_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 LOAD1x2 0 - KERNEL1x2_L 32,32, 0,0 - KERNEL1x2_E 32,32, 1,1 + KERNEL1x2_L 32,16, 0,0 + KERNEL1x2_E 32,16, 1,1 MY_ALIGN ZGEMM_L1x2_SUB2_2: andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 LOAD1x2 0 - KERNEL1x2_E 32,32, 0,1 + KERNEL1x2_E 32,16, 0,1 MY_ALIGN ZGEMM_L1x2_SUB2_1: andi. T1,L, 1 @@ -769,7 +909,7 @@ ZGEMM_L1x1_BEGIN: andi. T1, M, 1 ble ZGEMM_L1x1_END - mr BO, BBUFFER + mr BO, B mr T1, K addi T1,T1, -1 srawi. L, T1, 5 /**(K-1) % 16x */ @@ -783,26 +923,26 @@ ZGEMM_L1x1_LOOP_START: MY_ALIGN ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,32,0,0 - KERNEL1x1_L 16,32,1,0 - KERNEL1x1_L 16,32,2,0 - KERNEL1x1_L 16,32,3,0 - KERNEL1x1_L 16,32,4,0 - KERNEL1x1_L 16,32,5,0 - KERNEL1x1_L 16,32,6,0 - KERNEL1x1_L 16,32,7,0 - KERNEL1x1_L 16,32,8,0 - KERNEL1x1_L 16,32,9,0 - KERNEL1x1_L 16,32,10,0 - KERNEL1x1_L 16,32,11,0 - KERNEL1x1_L 16,32,12,0 - KERNEL1x1_L 16,32,13,0 - KERNEL1x1_L 16,32,14,0 - KERNEL1x1_L 16,32,15,1 + KERNEL1x1_L 16,16,0,0 + KERNEL1x1_L 16,16,1,0 + KERNEL1x1_L 16,16,2,0 + KERNEL1x1_L 16,16,3,0 + KERNEL1x1_L 16,16,4,0 + KERNEL1x1_L 16,16,5,0 + KERNEL1x1_L 16,16,6,0 + KERNEL1x1_L 16,16,7,0 + KERNEL1x1_L 16,16,8,0 + KERNEL1x1_L 16,16,9,0 + KERNEL1x1_L 16,16,10,0 + KERNEL1x1_L 16,16,11,0 + KERNEL1x1_L 16,16,12,0 + KERNEL1x1_L 16,16,13,0 + KERNEL1x1_L 16,16,14,0 + KERNEL1x1_L 16,16,15,1 bdnz ZGEMM_L1x1_LOOP MY_ALIGN ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 32 + END1x1 AO, BO, 16, 16 b ZGEMM_L1x1_SUB1 @@ -824,24 +964,24 @@ ZGEMM_L1x1_SUB2: MY_ALIGN ZGEMM_L1x1_SUB2_LOOP: LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_L 16,32, 1,0 - KERNEL1x1_L 16,32, 2,0 - KERNEL1x1_E 16,32, 3,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_L 16,16, 1,0 + KERNEL1x1_L 16,16, 2,0 + KERNEL1x1_E 16,16, 3,1 bdnz ZGEMM_L1x1_SUB2_LOOP MY_ALIGN ZGEMM_L1x1_SUB2_4: andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 LOAD1x1 0 - KERNEL1x1_L 16,32, 0,0 - KERNEL1x1_E 16,32, 1,1 + KERNEL1x1_L 16,16, 0,0 + KERNEL1x1_E 16,16, 1,1 MY_ALIGN ZGEMM_L1x1_SUB2_2: andi. 
T1,L, 2 ble ZGEMM_L1x1_SUB2_1 LOAD1x1 0 - KERNEL1x1_E 16,32, 0,1 + KERNEL1x1_E 16,16, 0,1 MY_ALIGN ZGEMM_L1x1_SUB2_1: andi. T1,L, 1 diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 93a309ad1..10d9e4cc3 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,68 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xsadddp - -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xsadddp - -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xsadddp - #define XSFADD_I1 xsadddp - #define XSFADD_I2 xssubdp - -#else // CC || CR || RC || RR - - #define XSFADD_R1 xsadddp - #define XSFADD_R2 xssubdp - #define XSFADD_I1 xssubdp - #define XSFADD_I2 xssubdp - -#endif - -.macro AGGREGATE_INTO_COMPLEX FIRST_V, SECOND_V, OUTPUT_V - AGGREGATE_INTO_COMPLEX_INNER \FIRST_V, \SECOND_V, \OUTPUT_V, vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7 -.endm - -.macro AGGREGATE_INTO_COMPLEX_INNER FIRST_V, SECOND_V, OUTPUT_V ,TEMP1,TEMP2,TEMP3,TEMP4,TEMP5,TEMP6,TEMP7,TEMP8 - xxlxor \TEMP1, \TEMP1, \TEMP1 - xxlxor \TEMP2, \TEMP2, \TEMP2 - - xxswapd \SECOND_V, \SECOND_V // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB - - XSFADD_I1 \TEMP2, \TEMP2, \FIRST_V // realA*imagB - XSFADD_I2 \TEMP2, \TEMP2, \SECOND_V // imagA*realB - - xxswapd \FIRST_V, \FIRST_V //imagA*realB, realA*realB -> realA*realB, imagA*realB - xxswapd \SECOND_V, \SECOND_V // reverse to original imagA*imagB, realA*imagB - - XSFADD_R1 \TEMP1, \TEMP1, \FIRST_V // realA*realB - XSFADD_R2 \TEMP1, \TEMP1, \SECOND_V // imagA*imagB - - xsmuldp \TEMP3, \TEMP2, alpha_i // imag*alpha_i - xsmuldp \TEMP4, \TEMP2, alpha_r // imag*alpha_r - xsmuldp \TEMP5, \TEMP1, alpha_r // real*alpha_r - xsmuldp \TEMP6, \TEMP1, alpha_i // real*alpha_i - - xssubdp \TEMP7, \TEMP5, \TEMP3 // real*alpha_r - imag*alpha_i - xsadddp \TEMP8, \TEMP6, \TEMP4 // real*alpha_i + imag*alpha_r - xxpermdi \OUTPUT_V, \TEMP8, \TEMP7, 0 // merge real and imag part -.endm - -/********************************************************************************************** -* Macros for N=2 and M=8 -**********************************************************************************************/ #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) @@ -95,338 +33,457 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
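The helper block that follows replaces the old scalar XSFADD/AGGREGATE_INTO_COMPLEX epilogue with a fully vectorized one. During the loop, each accumulator pair holds {ar*br, ai*bi} (from b unswapped) and {ar*bi, ai*br} (from the xxswapd copy); at save time the xxmrgld/xxmrghd merges regroup those lanes across two results and the signed combines produce real and imaginary parts two at a time. A scalar C sketch of the NN-case arithmetic (combine_nn is an invented name, and the real code additionally folds in the previously loaded C values):

    typedef struct { double re, im; } zdouble;

    /* rr = ar*br, ii = ai*bi, ri = ar*bi, ir = ai*br for one element */
    zdouble combine_nn(double rr, double ii, double ri, double ir,
                       zdouble alpha)
    {
        double t_re = rr - ii;          /* AGGREGATE_REALS_IMAGES, NN case */
        double t_im = ri + ir;
        zdouble out;
        out.re = t_re * alpha.re - t_im * alpha.im;  /* MULT_APLHA_PART1/2 */
        out.im = t_re * alpha.im + t_im * alpha.re;
        return out;
    }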
 #define DISP4(ind,disp) (ind*unit_size*4+disp)
 #define DISP2(ind,disp) (ind*unit_size*2+disp)
 #define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+/* HELPERS FOR SAVE */
+
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+
+/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
+.endm
+
+/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
+.endm
+
+/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* we assume a negated alpha {-alpha_r,-alpha_i} for this case */
+ /* the real lane holds i1*i2 - r1*r2, so the negated alpha_r fixes its sign */
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* the imag lane is likewise negated, so the negated alpha_i fixes its sign */
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+
+/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+
+/* unpack to store 2 {r,r} {i,i} pairs into {r,i} {r,i} (big endian because of stxv) */
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET+64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE
\VSRes9,\VSRes11,vs10,vs11 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs6,vs8,vs16,vs17 + MULT_APLHA_PART2 vs2,vs4,vs14,vs15 + AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + MULT_APLHA_PART1 vs10,vs12, vs24,vs25 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + MULT_APLHA_PART2 vs10,vs12,vs24,vs25 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 + UNPACK_FOR_STORE vs24,vs25,vs10,vs12 + UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART1 vs6,vs8, vs16,vs17 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs6,vs8,vs16,vs17 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + UNPACK_FOR_STORE vs16,vs17,vs3,vs5 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 + LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 +#ifndef TRMMKERNEL + lxv vs18, (\LOFFSET)(\BASE_REG) + xxmrgld vs14,vs18,vs18 + xxmrghd vs15,vs18,vs18 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 + AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 + MULT_APLHA_PART1 vs2,vs4, vs14,vs15 + MULT_APLHA_PART2 vs2,vs4, vs14,vs15 + UNPACK_FOR_STORE vs14,vs15,vs7,vs9 + xxmrghd vs7,vs15,vs14 + stxv vs7, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ .macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - 
xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 .endm .macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero2x8 + Zero2x8 .endif .endm .macro END2x8_NORMAL - END2x8 AO,BO,128,64 + END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 - 
xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + + + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs21, vs20 + xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, 
imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 - xvmaddadp vs48, vs0, vs18 // real*real, imag*real - xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs50, vs1, vs18 // real*real, imag*real - xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs52, vs2, vs18 // real*real, imag*real - xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs54, vs3, vs18 // real*real, imag*real - xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - xvmaddadp vs56, vs4, vs18 // real*real, imag*real - xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag - xvmaddadp vs58, vs5, vs18 // real*real, imag*real - xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - xvmaddadp vs60, vs6, vs18 // real*real, imag*real - xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag - xvmaddadp vs62, vs7, vs18 // real*real, imag*real - xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP16(\Index,128+ 
+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP8(\Index,128) + + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A .endif -.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real - xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs50, vs9, vs22 // real*real, imag*real - xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs52, vs10, vs22 // real*real, imag*real - xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs54, vs11, vs22 // real*real, imag*real - xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - xvmaddadp vs56, vs12, vs22 // real*real, imag*real - xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag - xvmaddadp vs58, vs13, vs22 // real*real, imag*real - xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - xvmaddadp vs60, vs14, vs22 // real*real, imag*real - xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag - xvmaddadp vs62, vs15, vs22 // real*real, imag*real - xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 + +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif + +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd 
vs19, vs18 +.endif + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, vs23 + + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 .endm -.macro KERNEL2x8 +.macro KERNEL2x8 LOAD2x8 0 - END2x8 AO, BO, 128,64 + END2x8 AO, BO, 128,32 .endm .macro SAVE2x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs48,vs49,vs8 - AGGREGATE_INTO_COMPLEX vs50,vs51,vs9 - AGGREGATE_INTO_COMPLEX vs52,vs53,vs10 - AGGREGATE_INTO_COMPLEX vs54,vs55,vs11 - AGGREGATE_INTO_COMPLEX vs56,vs57,vs12 - AGGREGATE_INTO_COMPLEX vs58,vs59,vs13 - AGGREGATE_INTO_COMPLEX vs60,vs61,vs14 - AGGREGATE_INTO_COMPLEX vs62,vs63,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 .endm @@ -435,223 +492,178 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, 
vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero2x4 + Zero2x4 .endif .endm .macro END2x4_NORMAL - END2x4 AO,BO,64,64 + END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - 
lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) +.endif +.endif -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, 
DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL2x4 +.macro KERNEL2x4 LOAD2x4 0 - END2x4 AO, BO, 64,64 + END2x4 AO, BO, 64,32 .endm -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs40,vs41,vs8 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs9 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs10 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 .endm @@ -660,170 +672,131 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, 
vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - .if \Zero==1 - Zero2x2 -.endif - + Zero2x2 +.endif .endm .macro END2x2_NORMAL - END2x2 AO,BO,32,64 + END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xxswapd vs23, vs22 -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs37, vs0, vs19 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - xvmaddadp vs36, vs0, vs18 // real*real, imag*real - 
xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs38, vs1, vs18 // real*real, imag*real - xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs39, vs1, vs19 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP8(\Index,128) -.endif -.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs37, vs8, vs23 + + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs39, vs9, vs23 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - xvmaddadp vs36, vs8, vs22 // real*real, imag*real - xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs38, vs9, vs22 // real*real, imag*real - xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag - .endm -.macro KERNEL2x2 +.macro KERNEL2x2 LOAD2x2 0 - END2x2 AO, BO, 32,64 + END2x2 AO, BO, 32,32 .endm -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs36,vs37,vs8 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - - addi CO, CO, 32 - +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -831,348 +804,288 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A + lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 
0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B - lxv vs18, 32(BO) // load real part from B - lxv vs19, 48(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + lxv vs18, 16(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 .if \Zero==1 - Zero2x1 -.endif - + Zero2x1 +.endif .endm .macro END2x1_NORMAL - END2x1 AO,BO,16,64 + END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB +.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag - .endm -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B -lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP8(\Index,16+\OffsetB)(\BREG) // load imag part from B - lxv vs22, DISP8(\Index,32+\OffsetB)(\BREG) // load real part from B - lxv vs23, DISP8(\Index,48+\OffsetB)(\BREG) // load imag part from B + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xxswapd vs21, vs20 + xxswapd vs23, vs22 +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) +.endif +.endif - xvmaddadp vs34, vs0, vs18 // real*real, imag*real - xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs35, vs0, vs19 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP8(\Index, 64+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP8(\Index,64+16+\OffsetB)(\BREG) // load imag part from B - lxv vs18, DISP8(\Index,64+32+\OffsetB)(\BREG) // load real part from B - lxv vs19, DISP8(\Index,64+48+\OffsetB)(\BREG) // load imag part from B .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,64+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP8(\Index,128) +.if \Complete==0 + lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.if 
\IsLast==1 + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif .endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs8, vs22 // real*real, imag*real - xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag - +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs35, vs8, vs23 + .endm -.macro KERNEL2x1 +.macro KERNEL2x1 LOAD2x1 0 - END2x1 AO, BO, 16,64 + END2x1 AO, BO, 16,32 .endm .macro SAVE2x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - - AGGREGATE_INTO_COMPLEX vs34,vs35,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - - addi CO, CO, 16 - + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 .endm /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ .macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 .endm .macro LOAD1x8 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A + lxv vs4, 64(AO) // load real,imag from A + lxv vs5, 80(AO) // load real,imag from A + lxv vs6, 96(AO) // load real,imag from A + lxv vs7, 112(AO) // load real,imag from A .if \Zero==1 - Zero1x8 + Zero1x8 .endif .endm .macro END1x8_NORMAL - END1x8 AO,BO,128,32 + END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag 
- xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - xvmaddadp vs40, vs4, vs16 // real*real, imag*real - xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag - xvmaddadp vs42, vs5, vs16 // real*real, imag*real - xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - xvmaddadp vs44, vs6, vs16 // real*real, imag*real - xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - xvmaddadp vs46, vs7, vs16 // real*real, imag*real - xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load 
real,imag from A + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,48+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - xvmaddadp vs40, vs12, vs20 // 
real*real, imag*real - xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag - xvmaddadp vs42, vs13, vs20 // real*real, imag*real - xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - xvmaddadp vs44, vs14, vs20 // real*real, imag*real - xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag - xvmaddadp vs46, vs15, vs20 // real*real, imag*real - xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 .endm -.macro KERNEL1x8 +.macro KERNEL1x8 LOAD1x8 0 - END1x8 AO, BO, 128,32 + END1x8 AO, BO, 128,16 .endm .macro SAVE1x8 - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - lxv vs20, 0(T2) - lxv vs21, 16(T2) - lxv vs22, 32(T2) - lxv vs23, 48(T2) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - AGGREGATE_INTO_COMPLEX vs40,vs41,vs12 - AGGREGATE_INTO_COMPLEX vs42,vs43,vs13 - AGGREGATE_INTO_COMPLEX vs44,vs45,vs14 - AGGREGATE_INTO_COMPLEX vs46,vs47,vs15 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - xvadddp vs12, vs12, vs20 - xvadddp vs13, vs13, vs21 - xvadddp vs14, vs14, vs22 - xvadddp vs15, vs15, vs23 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - stxv vs12, 0(T2) - stxv vs13, 16(T2) - stxv vs14, 32(T2) - stxv vs15, 48(T2) - - addi CO, CO, 128 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 .endm @@ -1181,170 +1094,143 @@ lxv vs20, DISP8(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 .endm .macro LOAD1x4 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A + lxv vs2, 32(AO) // load real,imag from A + lxv vs3, 48(AO) // load real,imag from A - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - .if \Zero==1 - Zero1x4 + Zero1x4 .endif .endm .macro END1x4_NORMAL - END1x4 AO,BO,64,32 + END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - 
xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - xvmaddadp vs36, vs2, vs16 // real*real, imag*real - xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag - xvmaddadp vs38, vs3, vs16 // real*real, imag*real - xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 // real*real, imag*real - xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag - xvmaddadp vs42, vs1, vs18 // real*real, imag*real - xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag - xvmaddadp vs44, vs2, vs18 // real*real, imag*real - xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag - xvmaddadp vs46, vs3, vs18 // real*real, imag*real - xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs47, vs3, vs19 .if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 
32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - xvmaddadp vs36, vs10, vs20 // real*real, imag*real - xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag - xvmaddadp vs38, vs11, vs20 // real*real, imag*real - xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +.if \Complete==0 + lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs8, vs22 // real*real, imag*real - xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag - xvmaddadp vs42, vs9, vs22 // real*real, imag*real - xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag - xvmaddadp vs44, vs10, vs22 // real*real, imag*real - xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag - xvmaddadp vs46, vs11, vs22 // real*real, imag*real - xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 + + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs41, vs8, vs23 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs43, vs9, vs23 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs47, vs11, vs23 .endm -.macro KERNEL1x4 +.macro KERNEL1x4 LOAD1x4 0 - END1x4 AO, BO, 64,32 + END1x4 AO, BO, 64,16 .endm .macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - lxv vs18, 32(T1) - lxv vs19, 48(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - AGGREGATE_INTO_COMPLEX vs36,vs37,vs10 - AGGREGATE_INTO_COMPLEX vs38,vs39,vs11 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - xvadddp vs10, vs10, vs18 - xvadddp vs11, vs11, vs19 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - stxv vs10, 32(T1) - stxv vs11, 48(T1) - - addi CO, CO, 64 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, CO, 64 .endm @@ -1353,122 +1239,99 @@ lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B **********************************************************************************************/ .macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 .endm .macro LOAD1x2 Zero - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part 
from B - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17,vs16 + lxv vs0, 0(AO) // load real,imag from A + lxv vs1, 16(AO) // load real,imag from A .if \Zero==1 - Zero1x2 + Zero1x2 .endif .endm .macro END1x2_NORMAL - END1x2 AO,BO,32,32 + END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA .endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21,vs20 -lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag - xvmaddadp vs34, vs1, vs16 // real*real, imag*real - xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 .if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A .endif - -.if \IsLast==1 +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17,vs16 +.endif +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif .endif -.endif - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - xvmaddadp vs34, vs9, vs20 // 
real*real, imag*real - xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 .endm -.macro KERNEL1x2 +.macro KERNEL1x2 LOAD1x2 0 - END1x2 AO, BO, 32,32 + END1x2 AO, BO, 32,16 .endm .macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - - lxv vs16, 0(T1) - lxv vs17, 16(T1) - -#endif - - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - AGGREGATE_INTO_COMPLEX vs34,vs35,vs9 - -#ifndef TRMMKERNEL - - xvadddp vs8, vs8, vs16 - xvadddp vs9, vs9, vs17 - -#endif - - stxv vs8, 0(T1) - stxv vs9, 16(T1) - -addi CO, CO, 32 - + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 .endm /********************************************************************************************** @@ -1476,189 +1339,89 @@ addi CO, CO, 32 **********************************************************************************************/ .macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 .endm .macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - - lxv vs16, 0(BO) // load real part from B - lxv vs17, 16(BO) // load imag part from B + lxv vs0, 0(AO) // load real,imag from A + lxv vs16, 0(BO) // load real imag from B + xxswapd vs17, vs16 .if \Zero==1 - Zero1x1 + Zero1x1 .endif - + .endm .macro END1x1_NORMAL - END1x1 AO,BO,16,32 + END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB +.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB .endif - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + +.endm + +.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + +.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B + xxswapd vs21, vs20 + + lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - -.endm - -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real part from B - lxv vs21, DISP4(\Index,16+\OffsetB)(\BREG) // load imag part from B - - xvmaddadp vs32, vs0, vs16 // real*real, imag*real - xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real part from B - lxv vs17, DISP4(\Index,32+16+\OffsetB)(\BREG) // load imag part from B + lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B + xxswapd vs17, vs16 .endif - -.if \IsLast==1 +.if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, 
DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) + addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) .endif .endif - - xvmaddadp vs32, vs8, vs20 // real*real, imag*real - xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag - - + + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 + + .endm -.macro KERNEL1x1 +.macro KERNEL1x1 LOAD1x1 0 - END1x1 AO, BO, 16,32 - -.endm - -.macro SAVE1x1 - - mr T1, CO -#ifndef TRMMKERNEL - lxv vs16, 0(T1) -#endif - AGGREGATE_INTO_COMPLEX vs32,vs33,vs8 - -#ifndef TRMMKERNEL - xvadddp vs8, vs8, vs16 -#endif - - stxv vs8, 0(T1) - -addi CO, CO, 16 + END1x1 AO, BO, 16,16 .endm - -.macro ZCOPYB_2 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - addi BO, BO, 32 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - addi BBO, BBO, 64 - -.endm - -.macro ZCOPYB_1 - - lxv vs32, 0(BO) - addi BO, BO, 16 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - - addi BBO, BBO, 32 - -.endm - -.macro ZCOPYB_8 - - lxv vs32, 0(BO) - lxv vs33, 16(BO) - lxv vs34, 32(BO) - lxv vs35, 48(BO) - - lxv vs36, 64+0(BO) - lxv vs37, 64+16(BO) - lxv vs38, 64+32(BO) - lxv vs39, 64+48(BO) - addi BO, BO, 128 - xxspltd vs40, vs32, 1 - xxspltd vs41, vs32, 0 - xxspltd vs42, vs33, 1 - xxspltd vs43, vs33, 0 - xxspltd vs44, vs34, 1 - xxspltd vs45, vs34, 0 - xxspltd vs46, vs35, 1 - xxspltd vs47, vs35, 0 - - xxspltd vs48, vs36, 1 - xxspltd vs49, vs36, 0 - xxspltd vs50, vs37, 1 - xxspltd vs51, vs37, 0 - xxspltd vs52, vs38, 1 - xxspltd vs53, vs38, 0 - xxspltd vs54, vs39, 1 - xxspltd vs55, vs39, 0 - - stxv vs40, 0(BBO) - stxv vs41, 16(BBO) - stxv vs42, 32(BBO) - stxv vs43, 48(BBO) - - stxv vs44, 64+0(BBO) - stxv vs45, 64+16(BBO) - stxv vs46, 64+32(BBO) - stxv vs47, 64+48(BBO) - - stxv vs48, 128+ 0(BBO) - stxv vs49, 128+ 16(BBO) - stxv vs50, 128+ 32(BBO) - stxv vs51, 128+ 48(BBO) - - stxv vs52, 192 + 0(BBO) - stxv vs53, 192 + 16(BBO) - stxv vs54, 192+ 32(BBO) - stxv vs55, 192 + 48(BBO) - addi BBO, BBO, 256 - +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 .endm diff --git a/param.h b/param.h index d0b8518c9..8f78a6a64 100644 --- a/param.h +++ b/param.h @@ -2248,15 +2248,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 640 +#define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 #define CGEMM_DEFAULT_P 640 -#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1408 +#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1152 +#define ZGEMM_DEFAULT_Q 1025 #define SYMV_P 8 From 7a9a4dbc4fdd748747cd86ae685e760ae8cdc10f Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Fri, 3 May 2019 21:07:14 +0200 Subject: [PATCH 073/127] Fix detection of AVX512 capable compilers in getarch 21eda8b5 introduced a check in getarch.c to test if the compiler is capable of AVX512. This check currently fails, since the used __AVX2__ macro is only defined if getarch itself was compiled with AVX2/AVX512 support. Make sure this is the case by building getarch with -march=native on x86_64. It is only supposed to run on the build host anyway. 
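An illustrative sketch of the failure mode (hypothetical snippet, not the
literal getarch.c code): predefined macros such as __AVX2__ describe the
flags of the *current* compilation, not what the compiler is capable of,
so a capability probe guarded by __AVX2__ is silently skipped unless
getarch itself was built with AVX2 enabled.

    /* sketch only -- the real check in getarch.c may differ in detail */
    #if defined(__AVX2__)
    /* Only reached when getarch was compiled with AVX2 enabled
       (e.g. -march=native on an AVX2-capable build host); the
       AVX512 compiler-support probe would run here. */
    int compiler_may_support_avx512 = 1;
    #else
    int compiler_may_support_avx512 = 0;  /* probe never runs */
    #endif

Building getarch with -march=native makes the guard reflect the build
host, which is the machine getarch is meant to describe anyway.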
--- Makefile.system | 9 +++++++++ cmake/system.cmake | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/Makefile.system b/Makefile.system index f574edf88..eb57cbb30 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,11 @@ ifndef TOPDIR TOPDIR = . endif +# If ARCH is not set, we use the host system's architecture. +ifndef ARCH +ARCH := $(shell uname -m) +endif + # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -137,6 +142,10 @@ endif endif +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +ifeq ($(ARCH), x86_64) +GETARCH_FLAGS += -march=native +endif ifdef INTERFACE64 diff --git a/cmake/system.cmake b/cmake/system.cmake index adedd32cc..7f3696286 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -65,6 +65,11 @@ if (DEFINED TARGET) set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () +# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. +if (X86_64) + set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") +endif () + if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") From 9cdc828afa3b209c2c74a7d9daa5fac85bece49f Mon Sep 17 00:00:00 2001 From: Michael Lass Date: Fri, 3 May 2019 21:22:27 +0200 Subject: [PATCH 074/127] c_check: Unlink correct file --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index d93b756d5..271182c54 100644 --- a/c_check +++ b/c_check @@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } else { $no_avx512 = 0; } - unlink("tmpf.o"); + unlink("$tmpf.o"); } } From d0c3543c3f38bbbdf363caa8c37bcf6df5bdb6fd Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 10:30:57 +0000 Subject: [PATCH 075/127] power9 zgemm ztrmm optimized --- kernel/power/KERNEL.POWER9 | 2 +- kernel/power/zgemm_kernel_power9.S | 2 +- kernel/power/zgemm_logic_power9.S | 2352 +++++++++++++++++++--------- kernel/power/zgemm_macros_power9.S | 1692 ++++++++++++-------- param.h | 2 +- 5 files changed, 2671 insertions(+), 1379 deletions(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 440eaab1b..126313c9a 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -6,7 +6,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S SGEMMINCOPY = ../generic/gemm_ncopy_16.c diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index a41bcec77..813f270b8 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T8 r16 #define T5 r17 #define T2 r19 -#define T9 r20 +#define TEMP_REG r20 #define T6 r21 #define I r22 #define J r23 diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index 01685fe79..f902484a3 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -26,972 +26,1866 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #define MY_ALIGN .align 3 b ZGEMM_L2 - -/* MINI SUBROUTINES */ +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ - -/* 2x8 MAIN 128x+1 LOOP */ -ZGEMM_L2x8_LMAIN_SUB: - mtctr L - LOAD2x8 0 - MY_ALIGN +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN ZGEMM_L2x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_L 128,32,31,0 - KERNEL2x8_L 128,32,32,0 - KERNEL2x8_L 128,32,33,0 - KERNEL2x8_L 128,32,34,0 - KERNEL2x8_L 128,32,35,0 - KERNEL2x8_L 128,32,36,0 - KERNEL2x8_L 128,32,37,0 - KERNEL2x8_L 128,32,38,0 - KERNEL2x8_L 128,32,39,0 - KERNEL2x8_L 128,32,40,0 - KERNEL2x8_L 128,32,41,0 - KERNEL2x8_L 128,32,42,0 - KERNEL2x8_L 128,32,43,0 - KERNEL2x8_L 128,32,44,0 - KERNEL2x8_L 128,32,45,0 - KERNEL2x8_L 128,32,46,0 - KERNEL2x8_L 128,32,47,0 - KERNEL2x8_L 128,32,48,0 - KERNEL2x8_L 128,32,49,0 - KERNEL2x8_L 128,32,50,0 - KERNEL2x8_L 128,32,51,0 - KERNEL2x8_L 128,32,52,0 - KERNEL2x8_L 128,32,53,0 - KERNEL2x8_L 128,32,54,0 - KERNEL2x8_L 128,32,55,0 - KERNEL2x8_L 128,32,56,0 - KERNEL2x8_L 128,32,57,0 - KERNEL2x8_L 128,32,58,0 - KERNEL2x8_L 128,32,59,0 - KERNEL2x8_L 128,32,60,0 - KERNEL2x8_L 128,32,61,0 - KERNEL2x8_L 128,32,62,0 - KERNEL2x8_L 128,32,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + 
KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN ZGEMM_L2x8_LOOP_END: - END2x8 AO, BO, 128,32 - blr - +/*----------------------------------------*/ + END2x8_2 + blr MY_ALIGN + + ZGEMM_2x8_L64_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,0 - KERNEL2x8_L 128,32,16,0 - KERNEL2x8_L 128,32,17,0 - KERNEL2x8_L 128,32,18,0 - KERNEL2x8_L 128,32,19,0 - KERNEL2x8_L 128,32,20,0 - KERNEL2x8_L 128,32,21,0 - KERNEL2x8_L 128,32,22,0 - KERNEL2x8_L 128,32,23,0 - KERNEL2x8_L 128,32,24,0 - KERNEL2x8_L 128,32,25,0 - KERNEL2x8_L 128,32,26,0 - KERNEL2x8_L 128,32,27,0 - KERNEL2x8_L 128,32,28,0 - KERNEL2x8_L 128,32,29,0 - KERNEL2x8_L 128,32,30,0 - KERNEL2x8_E 128,32,31,1 - blr - - +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr MY_ALIGN + + ZGEMM_2x8_L32_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L 128,32,8,0 - KERNEL2x8_L 128,32,9,0 - KERNEL2x8_L 128,32,10,0 - KERNEL2x8_L 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L 128,32,12,0 - KERNEL2x8_L 128,32,13,0 - KERNEL2x8_L 128,32,14,0 - KERNEL2x8_L 128,32,15,1 - blr +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, 
T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr MY_ALIGN + ZGEMM_2x8_L16_SUB: - LOAD2x8 0 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L 128,32,0,0 - KERNEL2x8_L 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L 128,32,2,0 - KERNEL2x8_L 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L 128,32,4,0 - KERNEL2x8_L 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L 128,32,6,0 - KERNEL2x8_L 128,32,7,1 - blr - MY_ALIGN +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + ZGEMM_2x4_LMAIN_SUB: - mtctr L - LOAD2x4 0 - MY_ALIGN -ZGEMM_L2x4_LOOP: - KERNEL2x4_L 64,32,0,0 - KERNEL2x4_L 64,32,1,0 - KERNEL2x4_L 64,32,2,0 - KERNEL2x4_L 64,32,3,0 - KERNEL2x4_L 64,32,4,0 - KERNEL2x4_L 64,32,5,0 - KERNEL2x4_L 64,32,6,0 - KERNEL2x4_L 64,32,7,0 - KERNEL2x4_L 64,32,8,0 - KERNEL2x4_L 64,32,9,0 - KERNEL2x4_L 64,32,10,0 - KERNEL2x4_L 64,32,11,0 - KERNEL2x4_L 64,32,12,0 - KERNEL2x4_L 64,32,13,0 - KERNEL2x4_L 64,32,14,0 - KERNEL2x4_L 64,32,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN ZGEMM_L2x4_LOOP_END: - END2x4 AO, BO, 64,32 - blr - +/*----------------------------------------*/ + END2x4_2 + blr MY_ALIGN + + ZGEMM_2x4_L16_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_L 64,32, 3,0 - KERNEL2x4_L 64,32, 4,0 - KERNEL2x4_L 64,32, 5,0 - KERNEL2x4_L 64,32, 6,0 - KERNEL2x4_E 64,32, 7,1 +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 
+ KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 blr MY_ALIGN -ZGEMM_2x4_L8_SUB: - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_L 64,32, 1,0 - KERNEL2x4_L 64,32, 2,0 - KERNEL2x4_E 64,32, 3,1 +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 blr -/* MAIN LOOP BEGINS */ - MY_ALIGN + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + ZGEMM_L2: - srawi. J, N, 1 - ble ZGEMM_L2_END +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + ZGEMM_L2_BEGIN: - mr CO, C - slwi T1, LDC , 1 +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 add T2,C,LDC - mr AO, A - add C, C, T1 - srawi. I, M, 3 - ble ZGEMM_L2x8_END + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END dcbt CO,r0 /*just prefetch*/ dcbt T2,r0 -ZGEMM_L2x8_BEGIN: - mr T1, K - mr BO, B - dcbt B, r0 - dcbt AO, r0 - /* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -1 - /* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. L, T1, 7 /**(K-1) % 128x */ - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - -ZGEMM_L2x8_SUB0: - andi. 
L, K, 255 +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else cmpwi K,128 - bne ZGEMM_L2x8_SUB2 - MY_ALIGN -ZGEMM_L2x8_SUB2_128: - bl ZGEMM_2x8_L64_SUB - bl ZGEMM_2x8_L64_SUB - b ZGEMM_L2x8_SAVE +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE MY_ALIGN + + ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB MY_ALIGN + + ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x8_SUB2_4 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_L 128,32, 1,0 - KERNEL2x8_L 128,32, 2,0 - KERNEL2x8_E 128,32, 3,1 - MY_ALIGN + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x8_SUB2_2 - LOAD2x8 0 - KERNEL2x8_L 128,32, 0,0 - KERNEL2x8_E 128,32, 1,1 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x8_SUB2_1 - LOAD2x8 0 - KERNEL2x8_E 128,32, 0,1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 MY_ALIGN + + ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + ZGEMM_L2x8_SAVE: - addic. I, I, -1 - SAVE2x8 +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. 
T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN ZGEMM_L2x8_END: +/*----------------------------------------*/ + ZGEMM_L2x4_BEGIN: - - andi. T2, M, 7 - ble ZGEMM_L2x1_END - - andi. T1, M, 4 - ble ZGEMM_L2x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - ZERO2x4 - srawi. L, T1, 5 /**(K-1) % 32x */ - - ble ZGEMM_L2x4_SUB0 +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + ZGEMM_L2x4_SUB0: - andi. L, K, 63 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else cmpwi K,32 - bne ZGEMM_L2x4_SUB2 - MY_ALIGN -ZGEMM_L2x4_SUB2_32: - bl ZGEMM_2x4_L16_SUB - bl ZGEMM_2x4_L16_SUB - b ZGEMM_L2x4_SAVE +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE MY_ALIGN -ZGEMM_L2x4_SUB2: + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ andi. T1,L, 16 ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB + bl ZGEMM_2x4_L16_SUB MY_ALIGN -ZGEMM_L2x4_SUB2_8: + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ andi. T1,L, 8 ble ZGEMM_L2x4_SUB2_4 bl ZGEMM_2x4_L8_SUB MY_ALIGN + + ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x4_SUB2_2 - LOAD2x4 0 - KERNEL2x4_L 64,32, 0,0 - KERNEL2x4_E 64,32, 1,1 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x4_SUB2_1 - LOAD2x4 0 - KERNEL2x4_E 64,32, 0,1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 MY_ALIGN + + ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif - SAVE2x4 ZGEMM_L2x4_END: +/*----------------------------------------*/ -ZGEMM_L2x2_BEGIN: - andi. T1, M, 2 - ble ZGEMM_L2x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 4 /**(K-1) % 16x */ - ZERO2x2 - ble ZGEMM_L2x2_SUB0 +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. 
L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -ZGEMM_L2x2_LOOP_START: - LOAD2x2 0 - mtctr L - MY_ALIGN -ZGEMM_L2x2_LOOP: - KERNEL2x2_L 32,32,0,0 - KERNEL2x2_L 32,32,1,0 - KERNEL2x2_L 32,32,2,0 - KERNEL2x2_L 32,32,3,0 - KERNEL2x2_L 32,32,4,0 - KERNEL2x2_L 32,32,5,0 - KERNEL2x2_L 32,32,6,0 - KERNEL2x2_L 32,32,7,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN -ZGEMM_L2x2_LOOP_END: - END2x2 AO, BO, 32,32 - - b ZGEMM_L2x2_SUB1 - ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x2_SUB2 - -ZGEMM_L2x2_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x2_SAVE ZGEMM_L2x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB MY_ALIGN -ZGEMM_L2x2_SUB2_LOOP: - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_L 32,32, 1,0 - KERNEL2x2_L 32,32, 2,0 - KERNEL2x2_E 32,32, 3,1 - bdnz ZGEMM_L2x2_SUB2_LOOP + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB MY_ALIGN + + ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x2_SUB2_2 - LOAD2x2 0 - KERNEL2x2_L 32,32, 0,0 - KERNEL2x2_E 32,32, 1,1 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 MY_ALIGN + + ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x2_SUB2_1 - LOAD2x2 0 - KERNEL2x2_E 32,32, 0,1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 MY_ALIGN -ZGEMM_L2x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 -ZGEMM_L2x2_SAVE: - SAVE2x2 + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + ZGEMM_L2x2_END: +/*----------------------------------------*/ +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -ZGEMM_L2x1_BEGIN: - andi. T1, M, 1 - ble ZGEMM_L2x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 4 /**(K-1) % 16x */ - ZERO2x1 - ble ZGEMM_L2x1_SUB0 -ZGEMM_L2x1_LOOP_START: - - LOAD2x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L2x1_LOOP: - KERNEL2x1_L 16,32,0,0 - KERNEL2x1_L 16,32,1,0 - KERNEL2x1_L 16,32,2,0 - KERNEL2x1_L 16,32,3,0 - KERNEL2x1_L 16,32,4,0 - KERNEL2x1_L 16,32,5,0 - KERNEL2x1_L 16,32,6,0 - KERNEL2x1_L 16,32,7,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: - END2x1 AO, BO, 16,32 - - b ZGEMM_L2x1_SUB1 - ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 31 - - b ZGEMM_L2x1_SUB2 - -ZGEMM_L2x1_SUB1: - - andi. L, T1, 15 - ble ZGEMM_L2x1_SAVE ZGEMM_L2x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L2x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB MY_ALIGN -ZGEMM_L2x1_SUB2_LOOP: - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_L 16,32, 1,0 - KERNEL2x1_L 16,32, 2,0 - KERNEL2x1_E 16,32, 3,1 - bdnz ZGEMM_L2x1_SUB2_LOOP + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB MY_ALIGN + + ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L2x1_SUB2_2 - LOAD2x1 0 - KERNEL2x1_L 16,32, 0,0 - KERNEL2x1_E 16,32, 1,1 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L2x1_SUB2_1 - LOAD2x1 0 - KERNEL2x1_E 16,32, 0,1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 MY_ALIGN + + ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif - SAVE2x1 ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN - slwi T1, K, 5 - add B, B, T1 - - addic. J, J, -1 - bgt ZGEMM_L2_BEGIN - - andi. 
T2, N, 1 - ble L999 ZGEMM_L2_END: - b ZGEMM_L1_BEGIN +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ -L999_H1: - b L999 +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 
256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + 
mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + ZGEMM_L1_BEGIN: - andi. T1, N, 1 - ble ZGEMM_L1_END +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 - mr CO, C - mr AO, A - srawi. I, M, 3 - ble ZGEMM_L1x8_END ZGEMM_L1x8_BEGIN: - - - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. L, T1, 5 /**(K-1) % 32x */ - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - - -ZGEMM_L1x8_LOOP_START: - - LOAD1x8 0 +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ li T2, 1024 - li T3, 1024+512 - li T4, 2048 - li T5, 2048+512 - mtctr L + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + - MY_ALIGN -ZGEMM_L1x8_LOOP: - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L 128,16,0,0 - KERNEL1x8_L 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L 128,16,2,0 - KERNEL1x8_L 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L 128,16,4,0 - KERNEL1x8_L 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L 128,16,6,0 - KERNEL1x8_L 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L 128,16,8,0 - KERNEL1x8_L 128,16,9,0 - KERNEL1x8_L 128,16,10,0 - KERNEL1x8_L 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L 128,16,12,0 - KERNEL1x8_L 128,16,13,0 - KERNEL1x8_L 128,16,14,0 - KERNEL1x8_L 128,16,15,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: - END1x8 AO, BO, 128,16 - - b ZGEMM_L1x8_SUB1 - ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x8_SUB2 - -ZGEMM_L1x8_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x8_SAVE ZGEMM_L1x8_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x8_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB MY_ALIGN -ZGEMM_L1x8_SUB2_LOOP: - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_L 128,16, 1,0 - KERNEL1x8_L 128,16, 2,0 - KERNEL1x8_E 128,16, 3,1 - bdnz ZGEMM_L1x8_SUB2_LOOP - MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x8_SUB2_2 - LOAD1x8 0 - KERNEL1x8_L 128,16, 0,0 - KERNEL1x8_E 128,16, 1,1 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x8_SUB2_1 - LOAD1x8 0 - KERNEL1x8_E 128,16, 0,1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 MY_ALIGN + + ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN - SAVE1x8 - - addic. I, I, -1 - bgt ZGEMM_L1x8_BEGIN ZGEMM_L1x8_END: +/*----------------------------------------*/ + ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - -ZGEMM_L1x4_LOOP_START: - LOAD1x4 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x4_LOOP: - KERNEL1x4_L 64,16,0,0 - KERNEL1x4_L 64,16,1,0 - KERNEL1x4_L 64,16,2,0 - KERNEL1x4_L 64,16,3,0 - KERNEL1x4_L 64,16,4,0 - KERNEL1x4_L 64,16,5,0 - KERNEL1x4_L 64,16,6,0 - KERNEL1x4_L 64,16,7,0 - KERNEL1x4_L 64,16,8,0 - KERNEL1x4_L 64,16,9,0 - KERNEL1x4_L 64,16,10,0 - KERNEL1x4_L 64,16,11,0 - KERNEL1x4_L 64,16,12,0 - KERNEL1x4_L 64,16,13,0 - KERNEL1x4_L 64,16,14,0 - KERNEL1x4_L 64,16,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN -ZGEMM_L1x4_LOOP_END: - END1x4 AO, BO, 64,16 - - b ZGEMM_L1x4_SUB1 - ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x4_SUB2 - -ZGEMM_L1x4_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE ZGEMM_L1x4_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x4_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB MY_ALIGN -ZGEMM_L1x4_SUB2_LOOP: - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_L 64,16, 1,0 - KERNEL1x4_L 64,16, 2,0 - KERNEL1x4_E 64,16, 3,1 - bdnz ZGEMM_L1x4_SUB2_LOOP + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB MY_ALIGN + + ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x4_SUB2_2 - LOAD1x4 0 - KERNEL1x4_L 64,16, 0,0 - KERNEL1x4_E 64,16, 1,1 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x4_SUB2_1 - LOAD1x4 0 - KERNEL1x4_E 64,16, 0,1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 MY_ALIGN + + ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif - SAVE1x4 ZGEMM_L1x4_END: +/*----------------------------------------*/ + ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 - andi. T1, M, 2 - ble ZGEMM_L1x2_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - -ZGEMM_L1x2_LOOP_START: - LOAD1x2 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x2_LOOP: - KERNEL1x2_L 32,16,0,0 - KERNEL1x2_L 32,16,1,0 - KERNEL1x2_L 32,16,2,0 - KERNEL1x2_L 32,16,3,0 - KERNEL1x2_L 32,16,4,0 - KERNEL1x2_L 32,16,5,0 - KERNEL1x2_L 32,16,6,0 - KERNEL1x2_L 32,16,7,0 - KERNEL1x2_L 32,16,8,0 - KERNEL1x2_L 32,16,9,0 - KERNEL1x2_L 32,16,10,0 - KERNEL1x2_L 32,16,11,0 - KERNEL1x2_L 32,16,12,0 - KERNEL1x2_L 32,16,13,0 - KERNEL1x2_L 32,16,14,0 - KERNEL1x2_L 32,16,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN -ZGEMM_L1x2_LOOP_END: - END1x2 AO, BO, 32,16 - - b ZGEMM_L1x2_SUB1 - ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x2_SUB2 - -ZGEMM_L1x2_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE ZGEMM_L1x2_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x2_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB MY_ALIGN -ZGEMM_L1x2_SUB2_LOOP: - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_L 32,16, 1,0 - KERNEL1x2_L 32,16, 2,0 - KERNEL1x2_E 32,16, 3,1 - bdnz ZGEMM_L1x2_SUB2_LOOP + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB MY_ALIGN + + ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x2_SUB2_2 - LOAD1x2 0 - KERNEL1x2_L 32,16, 0,0 - KERNEL1x2_E 32,16, 1,1 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 MY_ALIGN + + ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x2_SUB2_1 - LOAD1x2 0 - KERNEL1x2_E 32,16, 0,1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 MY_ALIGN -ZGEMM_L1x2_SUB2_1: - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 -ZGEMM_L1x2_SAVE: - SAVE1x2 + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + ZGEMM_L1x2_END: +/*----------------------------------------*/ + ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 - andi. T1, M, 1 - ble ZGEMM_L1x1_END - mr BO, B - mr T1, K - addi T1,T1, -1 - srawi. 
L, T1, 5 /**(K-1) % 16x */ - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - -ZGEMM_L1x1_LOOP_START: - - LOAD1x1 0 - mtctr L - - MY_ALIGN -ZGEMM_L1x1_LOOP: - KERNEL1x1_L 16,16,0,0 - KERNEL1x1_L 16,16,1,0 - KERNEL1x1_L 16,16,2,0 - KERNEL1x1_L 16,16,3,0 - KERNEL1x1_L 16,16,4,0 - KERNEL1x1_L 16,16,5,0 - KERNEL1x1_L 16,16,6,0 - KERNEL1x1_L 16,16,7,0 - KERNEL1x1_L 16,16,8,0 - KERNEL1x1_L 16,16,9,0 - KERNEL1x1_L 16,16,10,0 - KERNEL1x1_L 16,16,11,0 - KERNEL1x1_L 16,16,12,0 - KERNEL1x1_L 16,16,13,0 - KERNEL1x1_L 16,16,14,0 - KERNEL1x1_L 16,16,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN -ZGEMM_L1x1_LOOP_END: - END1x1 AO, BO, 16, 16 - - b ZGEMM_L1x1_SUB1 - ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN - andi. L, K, 63 - - b ZGEMM_L1x1_SUB2 - -ZGEMM_L1x1_SUB1: - - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE ZGEMM_L1x1_SUB2: - srawi. T1,L, 3 - ble ZGEMM_L1x1_SUB2_4 - mtctr T1 +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB MY_ALIGN -ZGEMM_L1x1_SUB2_LOOP: - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_L 16,16, 1,0 - KERNEL1x1_L 16,16, 2,0 - KERNEL1x1_E 16,16, 3,1 - bdnz ZGEMM_L1x1_SUB2_LOOP + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB MY_ALIGN + + ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ andi. T1,L, 4 ble ZGEMM_L1x1_SUB2_2 - LOAD1x1 0 - KERNEL1x1_L 16,16, 0,0 - KERNEL1x1_E 16,16, 1,1 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ andi. T1,L, 2 ble ZGEMM_L1x1_SUB2_1 - LOAD1x1 0 - KERNEL1x1_E 16,16, 0,1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 MY_ALIGN + + ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif - SAVE1x1 ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + ZGEMM_L1_END: +/*----------------------------------------*/ + \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 10d9e4cc3..8670e9574 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -25,7 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #define unit_size 16 #define DISP32(ind,disp) (ind*unit_size*32+disp) #define DISP16(ind,disp) (ind*unit_size*16+disp) @@ -34,10 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DISP2(ind,disp) (ind*unit_size*2+disp) #define DISP1(ind,disp) (ind*unit_size+disp) #define DISPX(disp) (disp) - /* HELPERS FOR SAVE */ - /* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ + + .macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET #ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) @@ -46,20 +45,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 #endif .endm - /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ + + .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ .endm - /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ + + .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ .endm - /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ + + .macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI #if defined(NN) || defined(NT) || defined(TN) || defined(TT) xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR @@ -78,8 +80,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI #endif .endm - /* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ + + .macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 #ifndef TRMMKERNEL xvmsubadp \VSOUT1,\VSINII, alpha_i @@ -89,23 +92,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp \VSOUT2,\VSINRR, alpha_i #endif .endm - /* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + .macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 xvmsubadp \VSOUT1,\VSINRR, alpha_r xvmaddadp \VSOUT2,\VSINII, alpha_r .endm - /* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 .endm + + .macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 stxv \VSIN1, DISPX(\LOFFSET)(\REG) stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) .endm + .macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -141,6 +149,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 .endm + .macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -161,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET @@ -173,6 +183,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm + .macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 #ifndef TRMMKERNEL @@ -188,9 +199,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxmrghd vs7,vs15,vs14 stxv vs7, (\LOFFSET)(\BASE_REG) .endm - /********************************************************************************************** -* Macros for N=2 and M=8 +* + +.macros for N=2 and M=8 **********************************************************************************************/ .macro Zero2x8 @@ -228,269 +240,272 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs63, vs63, vs63 .endm -.macro LOAD2x8 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero2x8 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x8_NORMAL END2x8 AO,BO,128,32 .endm -.macro END2x8 AREG, BREG, OffsetA, OffsetB +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - - xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 xvmaddadp vs63, vs7, vs19 - .endm -.macro KERNEL2x8_L OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B 
+ lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL2x8_E OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm .macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B xvmaddadp vs32, vs0, vs16 xvmaddadp vs48, vs0, vs18 xvmaddadp vs33, vs0, vs17 xvmaddadp vs49, vs0, vs19 - - xxswapd vs21, vs20 - xxswapd vs23, vs22 - + xxswapd vs21, vs20 + xxswapd vs23, vs22 xvmaddadp vs34, vs1, vs16 xvmaddadp vs50, vs1, vs18 - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs35, vs1, vs17 xvmaddadp vs51, vs1, vs19 - - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs52, vs2, vs18 - - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs37, vs2, vs17 xvmaddadp vs53, vs2, vs19 - - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs38, vs3, vs16 xvmaddadp vs54, vs3, vs18 - -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - xvmaddadp vs39, vs3, vs17 xvmaddadp vs55, vs3, vs19 - -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + 
lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 xvmaddadp vs58, vs5, vs18 xvmaddadp vs43, vs5, vs17 xvmaddadp vs59, vs5, vs19 - -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs60, vs6, vs18 xvmaddadp vs45, vs6, vs17 xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 xvmaddadp vs62, vs7, vs18 xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 - -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs48, vs8, vs22 .if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif - -.endif + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs33, vs8, vs21 xvmaddadp vs49, vs8, vs23 - -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 .endif - xvmaddadp vs34, vs9, vs20 xvmaddadp vs50, vs9, vs22 xvmaddadp vs35, vs9, vs21 xvmaddadp vs51, vs9, vs23 - +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs52, vs10, vs22 xvmaddadp vs37, vs10, vs21 xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs54, vs11, vs22 xvmaddadp vs39, vs11, vs21 xvmaddadp vs55, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs56, vs12, vs22 xvmaddadp vs41, vs12, vs21 xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 xvmaddadp vs58, vs13, vs22 xvmaddadp vs43, vs13, vs21 xvmaddadp vs59, vs13, vs23 - +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs60, vs14, vs22 xvmaddadp vs45, vs14, vs21 xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 xvmaddadp vs62, vs15, vs22 xvmaddadp vs47, vs15, vs21 xvmaddadp vs63, vs15, vs23 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if 
\IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif .endm + + + + .macro KERNEL2x8 - LOAD2x8 0 + LOAD2x8 END2x8 AO, BO, 128,32 .endm -.macro SAVE2x8 +.macro SAVE2x8 add T1, CO ,LDC SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=2 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero2x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -510,167 +525,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs47, vs47, vs47 .endm -.macro LOAD2x4 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero2x4 -.endif - + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x4_NORMAL END2x4 AO,BO,64,32 .endm -.macro END2x4 AREG, BREG, OffsetA, OffsetB +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 xvmaddadp vs47, vs3, vs19 .endm -.macro KERNEL2x4_L OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm -.macro KERNEL2x4_E OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, 
DISP4(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) -.endif -.endif - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm -.if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A - -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 .endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif 
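/*
   The .if \Complete==0 blocks above are the refill half of a two-step software
   pipeline: vs0-vs3 and vs16-vs19 carry the current k-step while vs8-vs11 and
   vs20-vs23 for the following step stream in between the FMAs, and the final
   trip (Complete==1) skips the refills. A plain-C model of that schedule, with
   helper names that are illustrative only and not part of the kernel:

       enum { K = 8, NBUF = 4 };
       static double cur[NBUF], nxt[NBUF], acc[NBUF];
       static void load(double *buf, int k)  { for (int i = 0; i < NBUF; i++) buf[i] = k + i; }
       static void fma_step(const double *b) { for (int i = 0; i < NBUF; i++) acc[i] += b[i]; }

       static void pipelined_loop(void) {
           load(cur, 0); load(nxt, 1);                      // LOAD2x4_2O primes both buffers
           for (int k = 0; k + 2 <= K; k += 2) {
               int last = (k + 2 == K);
               fma_step(cur); if (!last) load(cur, k + 2);  // first half: compute, then refill
               fma_step(nxt); if (!last) load(nxt, k + 3);  // second half, one step behind
           }                                                // last trip = KERNEL2x4_E2
       }
*/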
+ +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x4 - LOAD2x4 0 + LOAD2x4 END2x4 AO, BO, 64,32 .endm + + .macro SAVE2x4 add T1, CO ,LDC SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=2 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero2x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -680,231 +727,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
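/*
   A note on the arithmetic used throughout these complex kernels: each B value
   is kept twice, e.g. vs16 = (b_r, b_i) and vs17 = xxswapd(vs16) = (b_i, b_r),
   so one xvmaddadp pair per A element accumulates all four real products of a
   complex multiply. Modeled in C (struct and function names are ours, purely
   illustrative):

       typedef struct { double r, i; } zdouble;

       // rr_ii mirrors a vs32-style accumulator, ri_ir a vs33-style one
       static void zfma(const zdouble *a, const zdouble *b,
                        double rr_ii[2], double ri_ir[2]) {
           rr_ii[0] += a->r * b->r;   rr_ii[1] += a->i * b->i;   // a times vs16
           ri_ir[0] += a->r * b->i;   ri_ir[1] += a->i * b->r;   // a times vs17
       }

   The SAVEn macros (outside this hunk) then combine the lanes, in the
   non-conjugated case as real = rr - ii and imag = ri + ir.
*/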
xxlxor vs37, vs37, vs37 xxlxor vs38, vs38, vs38 xxlxor vs39, vs39, vs39 + .endm -.macro LOAD2x2 Zero - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 - - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - - -.if \Zero==1 - Zero2x2 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + .endm + .macro END2x2_NORMAL END2x2 AO,BO,32,32 .endm -.macro END2x2 AREG, BREG, OffsetA, OffsetB +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 xvmaddadp vs36, vs0, vs18 - xvmaddadp vs37, vs0, vs19 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 - -.endm - -.macro KERNEL2x2_L OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x2_E OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - xvmaddadp vs36, vs0, vs18 xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 xvmaddadp vs38, vs1, vs18 - xvmaddadp vs39, vs1, vs19 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs37, vs8, vs23 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 - - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs39, vs9, vs23 +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + 
KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 .endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x2 - LOAD2x2 0 + LOAD2x2 END2x2 AO, BO, 32,32 .endm + + .macro SAVE2x2 add T1, CO ,LDC SAVE2 vs32,vs33,vs34,vs35,CO,0 SAVE2 vs36,vs37,vs38,vs39,T1,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=2 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero2x1 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 xxlxor vs35, vs35, vs35 + .endm -.macro LOAD2x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - lxv vs18, 16(BO) // load real,imag from B +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.if \Zero==1 - Zero2x1 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A .endm + .macro END2x1_NORMAL END2x1 AO,BO,16,32 .endm -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 - -.endm - -.macro KERNEL2x1_L OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - -.macro KERNEL2x1_E OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 
-.endm - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index,16+\OffsetB)(\BREG) // load real,imag from B - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - - xxswapd vs21, vs20 - xxswapd vs23, vs22 -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP4(\Index,32+\OffsetB) -.endif -.endif - - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 32+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.if \IsLast==1 - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif - -.if \Complete==0 + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B xxswapd vs17, vs16 xxswapd vs19, vs18 -.endif + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs35, vs8, vs23 +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 .endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + .macro KERNEL2x1 - LOAD2x1 0 + LOAD2x1 END2x1 AO, BO, 16,32 .endm + + .macro SAVE2x1 add T1, CO ,LDC SAVE1 vs32,vs33,CO,0 @@ -913,8 +1028,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm /********************************************************************************************** -* Macros for N=1 and M=8 +* + +.macros for N=1 and M=8 **********************************************************************************************/ + + .macro Zero1x8 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -932,167 +1051,228 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs45, vs45, vs45 xxlxor vs46, vs46, vs46 xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 .endm -.macro LOAD1x8 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - - lxv vs4, 64(AO) // load real,imag from A - lxv vs5, 80(AO) // load real,imag from A - lxv vs6, 96(AO) // load real,imag from A - lxv vs7, 112(AO) // load real,imag from A - -.if \Zero==1 - Zero1x8 -.endif +.macro LOAD1x8 + LOAD1x8O 0,0 .endm + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x8_NORMAL END1x8 AO,BO,128,16 .endm -.macro END1x8 AREG, BREG, OffsetA, OffsetB +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 + xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 + xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 .endm -.macro KERNEL1x8_L OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + 
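/*
   Offset bookkeeping for the 1x8 block: a complex double is 16 bytes, so one
   k-step consumes 128 bytes of A (M=8) and 16 bytes of B (N=1), and the _2
   macros work through two k-steps per call, 256 bytes of A and 32 of B. The
   DISPn helpers used below appear to reduce to the following C, assuming the
   definitions at the top of this file (they are not shown in this hunk):

       #define UNIT_SIZE 16                                         // bytes per complex double
       #define DISP16(ind, disp) ((ind) * UNIT_SIZE * 16 + (disp))  // A panel, 2 k-steps of M=8
       #define DISP2(ind, disp)  ((ind) * UNIT_SIZE * 2  + (disp))  // B panel, 2 k-steps of N=1

   which is consistent with END1x8_2 passing 256/32 and with the IsLast path
   adding DISP16(\Index,256) and DISP2(\Index,32) to AO and BO.
*/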
+.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x8_E OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - - lxv vs8, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - lxv vs10, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 - lxv vs12, DISP16(\Index, 64 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 - lxv vs14, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs40, vs4, vs16 xvmaddadp vs41, vs4, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs42, vs5, vs16 xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs6, vs16 xvmaddadp vs45, vs6, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif + xvmaddadp vs46, vs7, vs16 xvmaddadp vs47, vs7, vs17 - - +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A -.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,128+\OffsetA) - addi \BREG, \BREG, 
DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A .endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs40, vs12, vs20 xvmaddadp vs41, vs12, vs21 xvmaddadp vs42, vs13, vs20 xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs44, vs14, vs20 xvmaddadp vs45, vs14, vs21 xvmaddadp vs46, vs15, vs20 xvmaddadp vs47, vs15, vs21 - +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + + + .macro KERNEL1x8 - LOAD1x8 0 + LOAD1x8 END1x8 AO, BO, 128,16 .endm + .macro SAVE1x8 - - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 addi CO, CO, 128 - .endm - /********************************************************************************************** -* Macros for N=1 and M=4 +* + +.macros for N=2 and M=4 **********************************************************************************************/ + .macro Zero1x4 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 @@ -1104,324 +1284,542 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
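/*
   Every tile size in this file now follows the same five-macro contract:
   ZeroNxM clears the accumulators, LOADNxM_2 primes two k-steps,
   KERNELNxM_L2 is the steady state, KERNELNxM_E2 (or ENDNxM_2) drains the
   pipeline and advances AO/BO, and SAVENxM writes the tile. As a C
   pseudo-driver, with illustrative names and the two-per-trip k count made
   explicit:

       zero_acc();                    // ZeroNxM
       load2(0);                      // LOADNxM_2
       long k, trips = K / 2;
       for (k = 0; k + 1 < trips; k++)
           kernel2(k, 0);             // KERNELNxM_L2: FMAs interleaved with refills
       kernel2(k, 1);                 // KERNELNxM_E2: FMAs only, then bump pointers
       save();                        // SAVENxM
*/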
xxlxor vs39, vs39, vs39 .endm -.macro LOAD1x4 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - lxv vs2, 32(AO) // load real,imag from A - lxv vs3, 48(AO) // load real,imag from A - -.if \Zero==1 - Zero1x4 -.endif +.macro LOAD1x4 + LOAD1x4O 0,0 .endm + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x4_NORMAL END1x4 AO,BO,64,16 .endm -.macro END1x4 AREG, BREG, OffsetA, OffsetB +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 + xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 .endm -.macro KERNEL1x4_L OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x4_E OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - lxv vs10, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A - + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs2, vs16 xvmaddadp vs37, vs2, vs17 + xvmaddadp vs38, vs3, vs16 xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs47, vs3, vs19 - -.if 
\Complete==0 - lxv vs0, DISP8(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif .if \Complete==0 - lxv vs2, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif xvmaddadp vs36, vs10, vs20 xvmaddadp vs37, vs10, vs21 xvmaddadp vs38, vs11, vs20 xvmaddadp vs39, vs11, vs21 - - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs41, vs8, vs23 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs43, vs9, vs23 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs47, vs11, vs23 - +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x4 - LOAD1x4 0 + LOAD1x4 END1x4 AO, BO, 64,16 .endm -.macro SAVE1x4 + + +.macro SAVE1x4 SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 addi CO, CO, 64 - .endm - /********************************************************************************************** -* Macros for N=1 and M=2 +* + +.macros for N=2 and M=2 **********************************************************************************************/ + .macro Zero1x2 xxlxor vs32, vs32, vs32 xxlxor vs33, vs33, vs33 xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 -.endm - -.macro LOAD1x2 Zero - - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17,vs16 - lxv vs0, 0(AO) // load real,imag from A - lxv vs1, 16(AO) // load real,imag from A - -.if \Zero==1 - Zero1x2 -.endif + xxlxor vs35, vs35, vs35 .endm + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + .macro END1x2_NORMAL END1x2 AO,BO,32,16 .endm -.macro END1x2 AREG, BREG, OffsetA, OffsetB +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 + xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, 
vs17 .endm -.macro KERNEL1x2_L OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x2_E OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21,vs20 - xvmaddadp vs32, vs0, vs16 xvmaddadp vs33, vs0, vs17 - - lxv vs8, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A - + xxswapd vs21, vs20 xvmaddadp vs34, vs1, vs16 xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,48+ \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17,vs16 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,32+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif - xvmaddadp vs32, vs8, vs20 xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif xvmaddadp vs34, vs9, vs20 xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif .endm + + .macro KERNEL1x2 - LOAD1x2 0 + LOAD1x2 END1x2 AO, BO, 32,16 .endm -.macro SAVE1x2 + + +.macro SAVE1x2 SAVE2 vs32,vs33,vs34,vs35,CO,0 addi CO, CO, 32 .endm - /********************************************************************************************** -* Macros for N=1 and M=1 +* + +.macros for N=2 and M=1 **********************************************************************************************/ + + .macro Zero1x1 xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 + xxlxor vs33, vs33, vs33 .endm -.macro LOAD1x1 Zero - lxv vs0, 0(AO) // load real,imag from A - lxv vs16, 0(BO) // load real imag from B - xxswapd vs17, vs16 -.if \Zero==1 - Zero1x1 -.endif - +.macro LOAD1x1 + 
LOAD1x1O 0,0 .endm + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + .macro END1x1_NORMAL END1x1 AO,BO,16,16 .endm -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 addi \BREG, \BREG, \OffsetB .endif - - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 .endm -.macro KERNEL1x1_L OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 .endm -.macro KERNEL1x1_E OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 .endm + .macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs20, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B - xxswapd vs21, vs20 - - lxv vs8, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - -.if \Complete==0 - lxv vs0, DISP2(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B .endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 16+\OffsetB)(\BREG) // load real imag from B - xxswapd vs17, vs16 -.endif - .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP2(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) .else - addi \AREG, \AREG, DISP2(\Index,32) + addi \AREG, \AREG, DISP2(\Index,32) addi \BREG, \BREG, DISP2(\Index,32) .endif -.endif - - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 - - +.endif .endm + + .macro KERNEL1x1 - LOAD1x1 0 + LOAD1x1 END1x1 AO, BO, 16,16 - .endm -.macro SAVE1x1 + + +.macro SAVE1x1 SAVE1 vs32,vs33,CO,0 addi CO, CO, 16 .endm +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi 
\REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/param.h b/param.h index 8f78a6a64..9a1a68ecd 100644 --- a/param.h +++ b/param.h @@ -2256,7 +2256,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
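/*
   Before the blocking parameters: the REFRESH_* macros above implement the
   standard TRMM edge bookkeeping quoted in their C comments, with SHIFT_REG
   turning an element count into a byte shift (slwi by log2(count * 16), e.g.
   C_A=8 becomes slwi 7, a multiply by 128 bytes). Worked through for this
   kernel's 8x2 zgemm tile (C_A=8, C_B=2) as C over byte offsets; the variable
   names are illustrative:

       // tile entry: REFRESH_POINTERS PTR_A,PTR_B,off,bb, 8,2
       ptrba += off * 8 * 16;        // SHIFT_REG T2, off, 8
       ptrbb  = bb + off * 2 * 16;   // SHIFT_REG T4, off, 2
       // trip count: REFRESH_TEMP_BK gives temp = bk - off, or off + 8, or off + 2
       // tile exit:  REFRESH_AFTER_SAVE gives temp = bk - off - (LEFT ? 8 : 2),
       //             ptrba += temp * 8 * 16, ptrbb += temp * 2 * 16,
       //             and, under LEFT, off += 8

   (The inline "temp -= 8" and "temp -= 4" comments are leftovers from a
   fixed-size variant; the macro itself is generic in C_A and C_B.)
*/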
#define SGEMM_DEFAULT_Q 1025 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 -#define ZGEMM_DEFAULT_Q 1025 +#define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 148c4cc5fd4db4d10dcda94c5640de12611b7669 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Wed, 5 Jun 2019 20:50:50 +0000 Subject: [PATCH 076/127] conflict resolve --- kernel/power/KERNEL.POWER9 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 126313c9a..0f91d6d7d 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -42,7 +42,7 @@ ZGEMMKERNEL = zgemm_kernel_power9.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMINCOPYOBJ = zgemm_incopy.o From 900d5a3205bd06c04990ff842449ec80808d5027 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:18:40 +0200 Subject: [PATCH 077/127] Add gfortran workaround for ABI violations in LAPACKE for #2154 (see gcc bug 90329) --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index a95d6190f..49c02bbcb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -744,6 +744,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive +# work around ABI problem with passing single-character arguments +FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran From a0caa762b3066f28b1b9334932b39a3d377f79f9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 6 Jun 2019 10:24:16 +0200 Subject: [PATCH 078/127] Add gfortran workaround for ABI violations for #2154 (see gcc bug 90329) --- Makefile.power | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.power b/Makefile.power index 195f1930f..24d8aa8a7 100644 --- a/Makefile.power +++ b/Makefile.power @@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas endif endif +# workaround for C->FORTRAN ABI violation in LAPACKE +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fno-optimize-sibling-calls +endif FLAMEPATH = $(HOME)/flame/lib From 6ca898b63b81325559cbd2e925bf245f2a8ac999 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 8 Jun 2019 23:17:03 +0200 Subject: [PATCH 079/127] Add gfortran workaround for potential ABI violation for #2154 --- cmake/fc.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index adec28a91..9d8a5713c 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,7 +44,10 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") + # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From e674e1c73515fab38e263d121429a1a5da494a45 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Jun 2019 09:31:13 +0200 Subject: [PATCH 080/127] Update fc.cmake --- cmake/fc.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 
9d8a5713c..f54c989d4 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -47,7 +47,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "$(FCOMMON_OPT) -fno-optimize-sibling-calls") + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") From 1f4b6a5d5d2c1e3aaf7ca1da6720825cd075391f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 09:50:13 +0200 Subject: [PATCH 081/127] Remove any inadvertent use of -march=native from DYNAMIC_ARCH builds from #2143, -march=native precludes use of more specific options like -march=skylake-avx512 in individual kernels, and defeats the purpose of dynamic arch anyway. --- cmake/arch.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 470ea2a8f..b4547b7c9 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -73,7 +73,8 @@ if (DYNAMIC_ARCH) endif () if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) - endif () + string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () From 4ea794a52253ee56573922a15a64606ec82248a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 10 Jun 2019 17:24:15 +0200 Subject: [PATCH 082/127] Avoid unintentional activation of TLS code via USE_TLS=0 fixes #2149 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..c24647d62 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1070,7 +1070,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifdef USE_TLS +ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif From d9ff2cd90df9a114701dcd6298ae8439d6648e04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Jun 2019 23:01:35 +0200 Subject: [PATCH 083/127] Do not force gcc options on non-gcc compilers fixes compile failure with pgi 18.10 as reported on OpenBLAS-users --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 1b7fe3ef4..d23645058 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,11 +28,15 @@ endif ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 +endif +ifeq $(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif endif +endif From 6d3efb2b5829d78926f818496de5572dbd34e64f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Jun 2019 08:08:11 +0200 Subject: [PATCH 084/127] Update Makefile.x86_64 --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d23645058..99364752f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -31,7 +31,7 @@ ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 endif -ifeq $(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif From bbd4bb0154b6c4bfc561dce07b71eba7c7fa9013 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 15:04:10 +0200 Subject: [PATCH 085/127] Zero ecx with a mov instruction PGI assembler does not like the initialization in the constraints. 
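For reference, the fixed helper as a self-contained, compilable snippet (the
operand order shown is the corrected one from the follow-up "Fix mov syntax"
commit below; the hunk in this commit still has the mov operands reversed):

    #include <stdio.h>

    /* ecx selects the cpuid sub-leaf, so it must be zeroed; doing that with an
       explicit mov sidesteps the "c"(0) input constraint PGI rejects. */
    static void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) {
        __asm__ __volatile__("mov $0, %%ecx;"
                             "cpuid"
                             : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                             : "0" (op));
    }

    int main(void) {
        int a, b, c, d;
        cpuid(0, &a, &b, &c, &d);
        printf("max basic cpuid leaf: %d\n", a);
        return 0;
    }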
--- common_x86_64.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index f59ff6627..9db66b545 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("cpuid" + __asm__ __volatile__("mov %%ecx, 0;" + "cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op), "c"(0)); + : "0" (op)); #endif } From 280552b988e4377d95bc2f77bc07d2c00bb544e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 16 Jun 2019 18:35:43 +0200 Subject: [PATCH 086/127] Fix mov syntax --- common_x86_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_x86_64.h b/common_x86_64.h index 9db66b545..c05998d58 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,7 +129,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("mov %%ecx, 0;" + __asm__ __volatile__("mov $0, %%ecx;" "cpuid" : "=a" (*eax), "=b" (*ebx), From cdbfb891da2a8de14aa1d9bd7a57265284f7432c Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Mon, 17 Jun 2019 15:33:38 +0000 Subject: [PATCH 087/127] new sgemm 8x16 --- kernel/power/sgemm_logic_power9.S | 193 ++++++++-------- kernel/power/sgemm_macros_power9.S | 344 +++++++++++++++-------------- param.h | 2 +- 3 files changed, 288 insertions(+), 251 deletions(-) diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index 25e8c8387..053836cbf 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -3,89 +3,89 @@ b L8 MY_ALIGN LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_0 - mtctr L + LOAD8x16_2 MY_ALIGN LSGEMM_L8x16_LOOP: - - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_2 64,32, 15,0 - KERNEL8x16_I1_L4_2 64,32, 16,0 - KERNEL8x16_I1_L4_2 64,32, 17,0 - KERNEL8x16_I1_L4_2 64,32, 18,0 - KERNEL8x16_I1_L4_2 64,32, 19,0 - KERNEL8x16_I1_L4_2 64,32, 20,0 - KERNEL8x16_I1_L4_2 64,32, 21,0 - KERNEL8x16_I1_L4_2 64,32, 22,0 - KERNEL8x16_I1_L4_2 64,32, 23,0 - KERNEL8x16_I1_L4_2 64,32, 24,0 - KERNEL8x16_I1_L4_2 64,32, 25,0 - KERNEL8x16_I1_L4_2 64,32, 26,0 - KERNEL8x16_I1_L4_2 64,32, 27,0 - KERNEL8x16_I1_L4_2 64,32, 28,0 - KERNEL8x16_I1_L4_2 64,32, 29,0 - KERNEL8x16_I1_L4_2 64,32, 30,0 - KERNEL8x16_I1_L4_2 64,32, 31,1 + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + 
KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 bdnz LSGEMM_L8x16_LOOP MY_ALIGN LSGEMM_L8x16_LOOP_END: - END8x16 0, AO, BO, 64, 32 + END8x16_2 blr MY_ALIGN LSGEMM_L8x16_L64_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_2 64,32, 7,0 - KERNEL8x16_I1_L4_2 64,32, 8,0 - KERNEL8x16_I1_L4_2 64,32, 9,0 - KERNEL8x16_I1_L4_2 64,32, 10,0 - KERNEL8x16_I1_L4_2 64,32, 11,0 - KERNEL8x16_I1_L4_2 64,32, 12,0 - KERNEL8x16_I1_L4_2 64,32, 13,0 - KERNEL8x16_I1_L4_2 64,32, 14,0 - KERNEL8x16_I1_L4_3 64,32, 15,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 blr LSGEMM_L8x16_L32_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_2 64,32, 3,0 - KERNEL8x16_I1_L4_2 64,32, 4,0 - KERNEL8x16_I1_L4_2 64,32, 5,0 - KERNEL8x16_I1_L4_2 64,32, 6,0 - KERNEL8x16_I1_L4_3 64,32, 7,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 blr LSGEMM_L8x16_L16_SUB: - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_2 64,32, 1,0 - KERNEL8x16_I1_L4_2 64,32, 2,0 - KERNEL8x16_I1_L4_3 64,32, 3,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 blr L8: @@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN: #if defined(TRMMKERNEL) REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 7 /**(T11-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ #else mr T12, K - addi T12,T12, -1 - srawi. L, T12, 7 /**(K-1) % 128x */ + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ #endif - ZERO8x16 + ZERO8x16 + mtctr L ble LSGEMM_L8x16_SUB0 bl LSGEMM_L8x16_LMAIN_SUB andi. L, T12, 127 @@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0: cmpwi T11,128 #else andi. 
L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else cmpwi K,128 -#endif - - bne LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB2_128: - bl LSGEMM_L8x16_L64_SUB - bl LSGEMM_L8x16_L64_SUB - b LSGEMM_L8x16_SAVE +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE MY_ALIGN LSGEMM_L8x16_SUB2: andi. T10,L,64 @@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16: LSGEMM_L8x16_SUB2_8: andi. T10,L, 8 ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_0 - KERNEL8x16_I1_L4_2 64,32, 0,0 - KERNEL8x16_I1_L4_3 64,32, 1,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 MY_ALIGN LSGEMM_L8x16_SUB2_4: andi. T10,L, 4 ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_0 - KERNEL8x16_I1_L4_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_2: andi. T10,L, 2 ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_0 - KERNEL8x16_I1_L2_3 64,32, 0,1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 MY_ALIGN LSGEMM_L8x16_SUB2_1: andi. T10,L, 1 diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 3f86a1d25..2c9e537c7 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ -.macro LOAD8x16_1 - LOAD8x16 1 -.endm - -.macro LOAD8x16_0 - LOAD8x16 0 -.endm + .macro KERNEL8x16_L1_L4 Index,IsLast KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 @@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 .endm -.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - + .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 .endm @@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
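/*
   Context for the macro rewrite that follows: the logic change above re-bases
   the sgemm trip count on K-2 because the reworked kernels consume two
   k-iterations per call. LOAD8x16_2 primes the first pair, the K==128 and
   K==129 special cases re-enter the unrolled body at LSGEMM_L8x16_K128 with
   AO/BO pre-adjusted, and the main-path counts reduce to roughly this C
   (illustrative only):

       long bodies = (K - 2) >> 7;    // srawi. L, T12, 7: full 128-k unrolled bodies
       long rem    = (K - 2) & 127;   // andi.  L, T12, 127: drained by the
                                      // 64/32/16/8/4/2/1 sub-kernels
*/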
xxlxor vs63, vs63, vs63 .endm -.macro LOAD8x16 Zero +.macro LOAD8x16 OffsetA,OffsetB - lxv vs24, 0(BO) - lxv vs28, 16(BO) + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) xxperm vs26, vs24, permute_mask xxperm vs30, vs28, permute_mask - lxv vs0, 0(AO) - lxv vs1, 16(AO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) xxpermdi vs25, vs24, vs24,2 xxpermdi vs29, vs28, vs28,2 - lxv vs2, 32(AO) - lxv vs3, 48(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) xxpermdi vs27, vs26, vs26,2 xxpermdi vs31, vs30, vs30,2 -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif .endm .macro END8x16_NORMAL END8x16 0, AO, BO, 64,32 .endm +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + .macro END8x16 First, AREG, BREG, OffsetA, OffsetB .if \OffsetB != 0 @@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete .endm .macro KERNEL8x16 First - LOAD8x16 0 + LOAD8x16 0,0 END8x16 \First, AO, BO, 64,32 .endm -.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro 
KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + xvmaddasp vs33, vs1,vs24 xvmaddasp vs48, vs0,vs28 - xvmaddasp vs52, vs0,vs29 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 xvmaddasp vs60, vs0,vs31 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - - - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs61, vs1,vs31 + xvmaddasp vs61, vs1,vs31 .if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) .endif - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs36, vs4,vs9 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, 
vs2,vs28 + xvmaddasp vs51, vs3,vs28 .if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 .endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + .if \IsLast==1 .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) .else - addi \AREG, \AREG, DISP32(\Index,128) addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) .endif .endif - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs52, vs4,vs13 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs60, vs4,vs15 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs61, vs5,vs15 - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs62, vs6,vs15 - - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs51, vs7,vs12 - xvmaddasp vs55, vs7,vs13 - xvmaddasp vs59, vs7,vs14 - xvmaddasp vs63, vs7,vs15 - .endm diff --git a/param.h b/param.h index 9a1a68ecd..3934da6c8 100644 --- a/param.h +++ b/param.h @@ -2253,7 +2253,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 640 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1025 +#define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 640 #define ZGEMM_DEFAULT_Q 1026 From a575f1e4c771b31ba29bd11af4a3190f240cf1d2 Mon Sep 17 00:00:00 2001 From: kavanabhat Date: Wed, 19 Jun 2019 15:27:14 +0530 Subject: [PATCH 088/127] Update dtrmm_kernel_16x4_power8.S --- kernel/power/dtrmm_kernel_16x4_power8.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 47e703a3a..57829ac51 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stvx v31, r11, r0 li r11,0 - stw r31, 144(SP) - stfd f1, ALPHA_SP stw r0, FZERO From 7684c4f8f8f979ec4d8a563e9b9cb442d9b04a80 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 20 Jun 2019 19:56:01 +0200 Subject: [PATCH 089/127] PGI compiler does not like -march=native --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 44eacda4b..fcb3cbe33 100644 --- a/Makefile.system +++ b/Makefile.system @@ -144,9 +144,10 @@ endif # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. ifeq ($(ARCH), x86_64) +ifneq ($(C_COMPILER), PGI) GETARCH_FLAGS += -march=native endif - +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) From eebfeba7680e4b81f0803f44999c86303aa5945b Mon Sep 17 00:00:00 2001 From: Piotr Kubaj Date: Tue, 25 Jun 2019 10:58:56 +0200 Subject: [PATCH 090/127] Fix build on FreeBSD/powerpc64. Signed-off-by: Piotr Kubaj --- common_power.h | 6 +++--- kernel/power/axpy.S | 2 +- kernel/power/axpy_ppc440.S | 2 +- kernel/power/cgemm_kernel_8x4_power8.S | 6 +++--- kernel/power/ctrmm_kernel_8x4_power8.S | 6 +++--- kernel/power/dgemm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrmm_kernel_16x4_power8.S | 4 ++-- kernel/power/dtrsm_kernel_LT_16x4_power8.S | 4 ++-- kernel/power/gemm_beta.S | 2 +- kernel/power/gemm_kernel.S | 6 +++--- kernel/power/gemm_kernel_altivec.S | 2 +- kernel/power/gemm_kernel_altivec_cell.S | 2 +- kernel/power/gemm_kernel_altivec_g4.S | 2 +- kernel/power/gemm_kernel_cell.S | 6 +++--- kernel/power/gemm_kernel_g4.S | 4 ++-- kernel/power/gemm_kernel_hummer.S | 2 +- kernel/power/gemm_kernel_power3.S | 4 ++-- kernel/power/gemm_kernel_power6.S | 4 ++-- kernel/power/gemm_kernel_ppc440.S | 4 ++-- kernel/power/gemv_n.S | 4 ++-- kernel/power/gemv_n_ppc440.S | 4 ++-- kernel/power/gemv_t.S | 4 ++-- kernel/power/gemv_t_ppc440.S | 4 ++-- kernel/power/ger.S | 4 ++-- kernel/power/scal.S | 2 +- kernel/power/scal_ppc440.S | 2 +- kernel/power/sgemm_kernel_16x8_power8.S | 4 ++-- kernel/power/strmm_kernel_16x8_power8.S | 4 ++-- kernel/power/swap.S | 2 +- kernel/power/symv_L.S | 4 ++-- kernel/power/symv_U.S | 4 ++-- kernel/power/trsm_kernel_LN.S | 6 +++--- kernel/power/trsm_kernel_LT.S | 6 +++--- kernel/power/trsm_kernel_RT.S | 6 +++--- kernel/power/trsm_kernel_cell_LN.S | 6 +++--- kernel/power/trsm_kernel_cell_LT.S | 6 +++--- kernel/power/trsm_kernel_cell_RT.S | 6 +++--- kernel/power/trsm_kernel_hummer_LN.S | 2 +- kernel/power/trsm_kernel_hummer_LT.S | 2 +- kernel/power/trsm_kernel_hummer_RT.S | 2 +- kernel/power/trsm_kernel_power6_LN.S | 4 ++-- kernel/power/trsm_kernel_power6_LT.S | 4 ++-- kernel/power/trsm_kernel_power6_RT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LN.S | 4 ++-- kernel/power/trsm_kernel_ppc440_LT.S | 4 ++-- kernel/power/trsm_kernel_ppc440_RT.S | 4 ++-- kernel/power/zaxpy.S | 4 ++-- kernel/power/zaxpy_ppc440.S | 4 ++-- kernel/power/zgemm_beta.S | 2 +- kernel/power/zgemm_kernel.S | 8 ++++---- kernel/power/zgemm_kernel_8x2_power8.S | 6 +++--- kernel/power/zgemm_kernel_altivec.S | 6 +++--- kernel/power/zgemm_kernel_altivec_cell.S | 6 +++--- kernel/power/zgemm_kernel_altivec_g4.S | 4 ++-- kernel/power/zgemm_kernel_cell.S | 8 ++++---- kernel/power/zgemm_kernel_g4.S | 6 +++--- kernel/power/zgemm_kernel_hummer.S | 2 +- kernel/power/zgemm_kernel_power3.S | 6 +++--- kernel/power/zgemm_kernel_power6.S | 6 +++--- kernel/power/zgemm_kernel_power9.S | 4 ++-- kernel/power/zgemm_kernel_ppc440.S | 6 +++--- kernel/power/zgemv_n.S | 4 ++-- kernel/power/zgemv_n_ppc440.S | 4 ++-- 
kernel/power/zgemv_t.S | 4 ++-- kernel/power/zgemv_t_ppc440.S | 4 ++-- kernel/power/zger.S | 4 ++-- kernel/power/zscal.S | 2 +- kernel/power/zscal_ppc440.S | 2 +- kernel/power/zswap.S | 4 ++-- kernel/power/zsymv_L.S | 4 ++-- kernel/power/zsymv_U.S | 4 ++-- kernel/power/ztrmm_kernel_8x2_power8.S | 6 +++--- kernel/power/ztrsm_kernel_LN.S | 8 ++++---- kernel/power/ztrsm_kernel_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_RT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_LN.S | 6 +++--- kernel/power/ztrsm_kernel_cell_LT.S | 8 ++++---- kernel/power/ztrsm_kernel_cell_RT.S | 6 +++--- kernel/power/ztrsm_kernel_hummer_LN.S | 2 +- kernel/power/ztrsm_kernel_hummer_LT.S | 2 +- kernel/power/ztrsm_kernel_hummer_RT.S | 2 +- kernel/power/ztrsm_kernel_power6_LN.S | 6 +++--- kernel/power/ztrsm_kernel_power6_LT.S | 6 +++--- kernel/power/ztrsm_kernel_power6_RT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LN.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_LT.S | 6 +++--- kernel/power/ztrsm_kernel_ppc440_RT.S | 6 +++--- 87 files changed, 193 insertions(+), 193 deletions(-) diff --git a/common_power.h b/common_power.h index 889205c75..f38b85864 100644 --- a/common_power.h +++ b/common_power.h @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ @@ -784,7 +784,7 @@ Lmcount$lazy_ptr: #define HALT mfspr r0, 1023 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 @@ -829,7 +829,7 @@ Lmcount$lazy_ptr: #define MAP_ANONYMOUS MAP_ANON #endif -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index fb9789da4..238771826 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index 81a660e4d..7733e46e7 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 8dbb6011d..2bc99974f 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 26f49c663..822420dfd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 41958eab0..651fd53fc 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ li r11,0 slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 57829ac51..84c65f503 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -269,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 7a4a30390..8a423f181 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -217,7 +217,7 @@ li r11,0 #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 7acc05b4d..81457b698 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 16(SP) stw r0, 24(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index e5e9ec346..37ff9c9e7 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -186,7 +186,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -228,7 +228,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 6c7e78319..2dae49cb8 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index b7445a1f6..0823420dd 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 548150143..3a214b248 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index f3d3b8325..26f9cb023 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -192,7 +192,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -226,7 +226,7 @@ li PREC, 4 * SIZE #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef 
__64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index 259f04c4e..a5c4d3a43 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 3a8e1edfa..6ecbeb3e0 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index 4a6b5da62..f88bc291c 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -187,7 +187,7 @@ li PREC, 4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index 1a412c4fb..b274f7655 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index b128beb38..c5ef6e4e5 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 02160bd61..abc61b62e 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -252,7 +252,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index beb21200a..18d804520 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -199,7 +199,7 @@ stw r23, 180(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) 
#ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 457753065..25a4dd01b 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -260,7 +260,7 @@ stw r29, 220(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 6e560db6c..7d12b07a4 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -190,7 +190,7 @@ stw r22, 192(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/ger.S b/kernel/power/ger.S index fd397ce8c..d83546b0d 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -224,7 +224,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 7c65d1234..19fdd32ab 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index ed148834d..d977b0b59 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c72b00cf6..3e6440af8 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, 2 #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index f9b8a0bb8..78e539231 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S index e862b17bb..c9c0f86b0 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index f7d768c50..a4ff703e2 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -248,7 +248,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index d8e082397..c3063e077 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -247,7 +247,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 7983c573b..8319d5ed8 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -236,7 +236,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index c561fd014..30f25e015 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index 07b88402c..d39d3a6e2 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -254,7 +254,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LN.S 
b/kernel/power/trsm_kernel_cell_LN.S index 803530cbb..f656015a8 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 105e7d43c..083af7289 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index a54a261cb..5a5b67e77 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 109dacb8c..35ffab427 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index 1ad062a7c..f7a09dbd8 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index 94b3c0c85..0e563e5cc 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 937a6761a..83594c772 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT 
-#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 924f00ec0..54a8547b0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index 40ee5e28d..b2b27613c 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index 6b7312101..a708a084d 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index 28b109b96..31f82de2c 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -176,7 +176,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index df80cd393..f5005403c 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index ac5b249bb..b001f42d1 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -123,7 +123,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 
b5c604e91..848a0135f 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -112,7 +112,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 1f4c29210..57c3bed50 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 8(SP) stw r0, 16(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index 8ec8b674a..ae8a93e89 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -169,7 +169,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -190,7 +190,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index 5526b91c9..dfe2d9dc6 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2b650cd02..2525a8e58 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -264,7 +264,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 642d1f2e7..47a79064d 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -244,7 +244,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -270,7 +270,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index 0f7a6f9aa..c305270bd 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 8fd6b0afb..3d179378b 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -175,7 +175,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -196,7 +196,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -230,7 +230,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index bf6bf77e8..b92fb4225 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -185,7 +185,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -206,7 +206,7 @@ #endif #ifdef TRMMKERNEL -#if 
defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 991a64373..5546dd2f6 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index 471d3b9ae..d14cb1cd9 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -161,7 +161,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -202,7 +202,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 3c28649bc..9b47b9fc1 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -199,7 +199,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -220,7 +220,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index 813f270b8..d1e60da6c 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
std r0, FLINK_SAVE(SP) -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index 748b69a0c..ba99a21c5 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -182,7 +182,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -203,7 +203,7 @@ #endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index f93439986..708f1318d 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -250,7 +250,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index 55dd2d84f..bd1148b65 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -223,7 +223,7 @@ stw r22, 176(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index 9c6f510c2..d82fab16a 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -226,7 +226,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index bfc039a0c..d7f3ee027 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -179,7 +179,7 @@ stw r0, 4 + FZERO #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zger.S b/kernel/power/zger.S index a9a607815..73757d448 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -47,7 +47,7 @@ #endif #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -235,7 +235,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) 
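
Every hunk in this patch applies the same widening: FreeBSD follows the same SysV ELF ABI as Linux on powerpc64, so each guard that selects the ELF argument registers or the FRAMESLOT stack offsets must let FreeBSD take the Linux branch. A minimal sketch of the recurring pattern, using the "#define A r6" case that appears in the kernels above (preprocessed PowerPC assembly, as in the .S files themselves):

/* formerly "#ifdef linux"; FreeBSD shares the ELF calling convention,
   so it must take the same branch */
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define A r6      /* 32-bit ABI: pointer to matrix A arrives in r6 */
#endif
#endif

/* the 64-bit-only TRMM/TRSM offset load is widened the same way */
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
        ld      OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif

The common_power.h hunks at the top of the patch make the macro layer agree, pairing OS_FREEBSD with OS_LINUX in the PROLOGUE, MAX_CPU_NUMBER and FRAMESLOT definitions, so the per-kernel guards and the shared macros stay consistent.
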
diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index 2eb7b0df3..ae68ee672 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index d0e4c9bcf..55dd1b87b 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 8befadca2..415164a2b 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define N r3 #define X r6 @@ -117,7 +117,7 @@ stfd f30, 128(SP) stfd f31, 136(SP) -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index b348e328f..9f00df072 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define N r4 @@ -259,7 +259,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index b631cbe35..fe97fde8b 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define M r3 #define IS r4 @@ -256,7 +256,7 @@ stw r27, 196(SP) #endif -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index c1415138c..684cbd6eb 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 87473b45d..3acd9562d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -244,7 +244,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index db0860124..2d4f31189 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index c50ab86df..605363119 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 884a3e864..4798b5958 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S 
index 388dfe3c2..654938a4d 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -246,7 +246,7 @@ li PREA, 16 * 12 * SIZE #else -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index 00b50fe04..e3fe84d00 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index bf3eafa45..042f4d476 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index 865c85f78..fc8a0bef8 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 99868f948..17e31ffa8 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 65b8077db..3c40f605a 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index c27170604..b2a92301d 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if 
defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index ff0338cdc..cf37b5ca0 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index d33522456..f0be64d81 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index a9e7b891f..d5ff1b57f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index 43f4b07cb..b77dd76d1 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#ifdef linux +#if defined(linux) || defined(__FreeBSD__) #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if defined(linux) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif From 5a4f1a21188a99d935482f1bda057d4ea42d34f4 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:29:44 +0000 Subject: [PATCH 091/127] Fix build for PPC970 on FreeBSD pt. 1 FreeBSD needs DCBT_ARG=0 as well. 
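DCBT_ARG is the hint operand the Power kernels pass to the dcbt (data cache block touch) prefetch instruction. The nonzero hint evidently cannot be used in PPC970 builds, which is why common_power.h already selects 0 for PPC970 on Darwin; the FreeBSD/PPC970 build needs the same. A minimal sketch (a hypothetical standalone test program, not part of this patch) mirroring the selection logic from common_power.h after this change:

    #include <stdio.h>

    /* Mirrors the DCBT_ARG condition in common_power.h with this patch
       applied: PPC970 now gets 0 on both Darwin and FreeBSD. */
    #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || \
        defined(POWER8) || defined(POWER9) || \
        ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
    #define DCBT_ARG 0
    #else
    #define DCBT_ARG 8
    #endif

    int main(void) {
        /* e.g. "cc -DPPC970 -DOS_FREEBSD dcbt_arg.c" should print 0 */
        printf("DCBT_ARG = %d\n", DCBT_ARG);
        return 0;
    }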
--- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index f38b85864..5e15b7554 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From 7c7505a7784a698ecfac453284080b5074a7b102 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Fri, 28 Jun 2019 10:31:45 +0000 Subject: [PATCH 092/127] Fix build for PPC970 on FreeBSD pt.2 FreeBSD needs those macros too. --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 9a1a68ecd..0f354f2bc 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#if defined(OS_LINUX) || defined(OS_DARWIN) +#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 From a97b301aaabbe4bdf99a4506cc6a007d707f8b14 Mon Sep 17 00:00:00 2001 From: AbdelRauf Date: Tue, 18 Jun 2019 15:55:56 +0000 Subject: [PATCH 093/127] cgemm/ctrmm power9 --- kernel/power/KERNEL.POWER9 | 6 +- kernel/power/cgemm_kernel_power9.S | 293 +++ kernel/power/cgemm_logic_power9.S | 2816 ++++++++++++++++++++++++++ kernel/power/cgemm_macros_power9.S | 3019 ++++++++++++++++++++++++++++ kernel/power/zgemm_logic_power9.S | 2 +- param.h | 4 +- 6 files changed, 6134 insertions(+), 6 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power9.S create mode 100644 kernel/power/cgemm_logic_power9.S create mode 100644 kernel/power/cgemm_macros_power9.S diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 0f91d6d7d..31a5deeba 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -5,7 +5,7 @@ STRMMKERNEL = sgemm_kernel_power9.S DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +CTRMMKERNEL = cgemm_kernel_power9.S ZTRMMKERNEL = zgemm_kernel_power9.S SGEMMKERNEL = sgemm_kernel_power9.S @@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power9.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = cgemm_tcopy_8_power8.S +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S new file mode 100644 index 000000000..4b5c2fa31 --- /dev/null +++ b/kernel/power/cgemm_kernel_power9.S @@ -0,0 +1,293 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, 
FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S new file mode 100644 index 000000000..b4f937e90 --- /dev/null +++ b/kernel/power/cgemm_logic_power9.S @@ -0,0 +1,2816 @@ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + 
KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + 
KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. 
L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + 
KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 
64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S new file mode 100644 index 000000000..a256e1a01 --- /dev/null +++ b/kernel/power/cgemm_macros_power9.S @@ -0,0 +1,3019 @@ + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf (quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /* for the both-conjugated cases we assume alpha was loaded as {-alpha_r,-alpha_i} */
+ /* the real accumulator holds i1*i2-r1*r2, so negating alpha_r fixes the sign of the real part */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* likewise, negating alpha_i fixes the sign of the imaginary part */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /* for the both-conjugated cases we assume alpha was loaded as {-alpha_r,-alpha_i} */
+ /* the real accumulator holds i1*i2-r1*r2, so negating alpha_r fixes the sign of the real part */
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /* likewise, negating alpha_i fixes the sign of the imaginary part */
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* PART1: VSOUT1 = {i0,i1} * {alpha_i,alpha_i} ; VSOUT2 = {r0,r1} * {alpha_i,alpha_i}
+ (PART2 then subtracts from VSOUT1 and accumulates into VSOUT2) */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* PART2: VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 += {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
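+
+/* For reference, a scalar sketch of what the aggregate/scale pair above computes
+ per output element in the plain NN case (the names below are illustrative only,
+ not kernel symbols):
+
+ re = acc_rr - acc_ii; // AGGREGATE: r1*r2 - i1*i2
+ im = acc_ri + acc_ir; // AGGREGATE: r1*i2 + i1*r2
+ out_re = re * alpha_r - im * alpha_i; // PART1 mul, then xvmsubasp in PART2
+ out_im = im * alpha_r + re * alpha_i; // PART1 mul, then xvmaddasp in PART2
+
+ The conjugated variants differ only in which accumulator is added or
+ subtracted (and, for CC/CR/RC/RR, in the sign assumed on alpha). */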
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56, vs56
+ xxlxor vs57, vs57, vs57
+ xxlxor vs58, vs58, vs58
+ xxlxor vs59, vs59, vs59
+ xxlxor vs60, vs60, vs60
+ xxlxor vs61, vs61, vs61
+ xxlxor vs62, vs62, vs62
+ xxlxor vs63, vs63, vs63
+.endm
+
+
+.macro LOAD4x8
+ LOAD4x8O 0,0
+.endm
+
+
+.macro LOAD4x8O OffsetA,OffsetB
+ lxv vs24, (\OffsetB+0)(BO)
+ lxv vs28, (\OffsetB+16)(BO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ lxv vs0, (\OffsetA+0)(AO)
+ lxv vs1, (\OffsetA+16)(AO)
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ lxv vs2, (\OffsetA+32)(AO)
+ lxv vs3, (\OffsetA+48)(AO)
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_NORMAL
+ END4x8 AO,BO,64,32
+.endm
+
+
+.macro END4x8_WITHOUT_ADD
+ END4x8 AO,BO,0,0
+.endm
+
+
+.macro END4x8 AREG, BREG, OffsetA, OffsetB
+.if \OffsetB != 0
+ addi \BREG, \BREG, \OffsetB
+.endif
+
+.if \OffsetA != 0
+ addi \AREG, \AREG, \OffsetA
+.endif
+
+ xvmaddasp vs32, vs0,vs24
+ xvmaddasp vs33, vs1,vs24
+ xvmaddasp vs34, vs2,vs24
+ xvmaddasp vs35, vs3,vs24
+ xvmaddasp vs36, vs0,vs25
+ xvmaddasp vs37, vs1,vs25
+ xvmaddasp vs38, vs2,vs25
+ xvmaddasp vs39, vs3,vs25
+ xvmaddasp vs40, vs0,vs26
+ xvmaddasp vs41, vs1,vs26
+ xvmaddasp vs42, vs2,vs26
+ xvmaddasp vs43, vs3,vs26
+ xvmaddasp vs44, vs0,vs27
+ xvmaddasp vs45, vs1,vs27
+ xvmaddasp vs46, vs2,vs27
+ xvmaddasp vs47, vs3,vs27
+ xvmaddasp vs48, vs0,vs28
+ xvmaddasp vs49, vs1,vs28
+ xvmaddasp vs50, vs2,vs28
+ xvmaddasp vs51, vs3,vs28
+ xvmaddasp vs52, vs0,vs29
+ xvmaddasp vs53, vs1,vs29
+ xvmaddasp vs54, vs2,vs29
+ xvmaddasp vs55, vs3,vs29
+ xvmaddasp vs56, vs0,vs30
+ xvmaddasp vs57, vs1,vs30
+ xvmaddasp vs58, vs2,vs30
+ xvmaddasp vs59, vs3,vs30
+ xvmaddasp vs60, vs0,vs31
+ xvmaddasp vs61, vs1,vs31
+ xvmaddasp vs62, vs2,vs31
+ xvmaddasp vs63, vs3,vs31
+.endm
+
+
+.macro LOAD4x8_2
+ LOAD4x8_2O 0,0
+.endm
+
+
+.macro LOAD4x8_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs12, (16+\OffsetB)(BO)
+ lxv vs24, (32+\OffsetB)(BO)
+ lxv vs28, (32+16+\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ lxv vs5, (16+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+ xxperm vs14, vs12, permute_mask
+ lxv vs6, (32+\OffsetA)(AO)
+ lxv vs7, (48+\OffsetA)(AO)
+ xxpermdi vs9, vs8, vs8,2
+ xxpermdi vs13, vs12, vs12,2
+ lxv vs0, (64+\OffsetA)(AO)
+ lxv vs1, (64+16+\OffsetA)(AO)
+ xxpermdi vs11, vs10, vs10,2
+ xxpermdi vs15, vs14, vs14,2
+ lxv vs2, (64+32+\OffsetA)(AO)
+ lxv vs3, (64+48+\OffsetA)(AO)
+ xxperm vs26, vs24, permute_mask
+ xxperm vs30, vs28, permute_mask
+ xxpermdi vs25, vs24, vs24,2
+ xxpermdi vs29, vs28, vs28,2
+ xxpermdi vs27, vs26, vs26,2
+ xxpermdi vs31, vs30, vs30,2
+.endm
+
+
+.macro END4x8_2
+ /* for the two-iteration load the end offsets are 128 (A) and 64 (B) */
+ KERNEL4x8_2 AO,BO, 128,64,0 ,1,1
+.endm
+
+
+.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs33, vs5,vs8
+ xvmaddasp vs48, vs4,vs12
+ xvmaddasp vs49, vs5,vs12
+ xvmaddasp vs40, vs4,vs10
+ xvmaddasp vs41, vs5,vs10
+ xvmaddasp vs56, vs4,vs14
+ xvmaddasp vs57, vs5,vs14
+ xvmaddasp vs36, vs4,vs9
+ xvmaddasp vs37, vs5,vs9
+ xvmaddasp vs52, vs4,vs13
+ xvmaddasp vs53, vs5,vs13
+ xvmaddasp vs44, vs4,vs11
+ xvmaddasp vs45, vs5,vs11
+ xvmaddasp vs60, vs4,vs15
+ xvmaddasp vs61, vs5,vs15
+.if \Complete==0
+ lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG)
+ lxv vs5,
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
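+ /* the two reloads above fetch vs4/vs5 for the next iteration while the
+ current FMAs are still in flight; the E2 form (Complete==1) omits them
+ because no further iteration follows */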
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB
+ lxv vs8, (\OffsetB)(BO)
+ lxv vs4, (0+\OffsetA)(AO)
+ xxperm vs10, vs8, permute_mask
+.endm
+
+
+.macro END1x1_2
+ /*for load2 offset will be 16 and 16*/
+ KERNEL1x1_2 AO,BO, 16,16,0 ,1,1
+.endm
+
+
+.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
+.endm
+
+
+.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
+ KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
+.endm
+
+
+.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
+
+ xvmaddasp vs32, vs4,vs8
+ xvmaddasp vs40, vs4,vs10
+.if \Complete==0
+ lxv vs8, DISP2(\Index,\OffsetB)(\BREG)
+ lxv vs4, DISP2(\Index,\OffsetA)(\AREG)
+ xxperm vs10, vs8, permute_mask
+.endif
+
+.if \IsLast==1
+.if \Complete==1
+ addi \BREG, \BREG, DISP2(\Index,\OffsetB)
+ addi \AREG, \AREG, DISP2(\Index,\OffsetA)
+.else
+ addi \BREG, \BREG, DISP2(\Index,16)
+ addi \AREG, \AREG, DISP2(\Index,16)
+.endif
+
+.endif
+.endm
+
+
+.macro KERNEL1x1
+ LOAD1x1
+ END1x1 AO, BO, 8,8
+.endm
+
+
+.macro SAVE1x1
+#ifndef TRMMKERNEL
+ lxsd v4 , 0(CO)
+#endif
+ /*aggregate x2*/
+ xxpermdi vs33,vs32,vs32,2
+ xxpermdi vs41,vs40,vs40,2
+ xvaddsp vs32,vs32,vs33
+ xvaddsp vs40,vs40,vs41
+
+ xxperm vs0,vs32,permute_mask
+ xxperm vs4,vs40,permute_mask
+ AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4
+ /*inner reverse save_permute and store vs28 */
+ xxpermdi vs28,save_permute_1,save_permute_1,2
+ /*VSINRR,VSINII,VSOUT1,VSOUT2*/
+ MULT_APLHA_PART1 vs32,vs40,vs37,vs1
+ MULT_APLHA_PART2 vs32,vs40,vs37,vs1
+
+/* reconstruct r,i pairs*/
+ xxperm vs37,vs1, vs28
+
+#ifndef TRMMKERNEL
+ /* add */
+ xvaddsp vs36,vs36,vs37
+ stxsd v4 , 0(CO)
+#else
+
+/* vs37 is v5 */
+ stxsd v5 , 0(CO)
+#endif
+ addi CO, CO, 8
+.endm
+
+
+
+
+/****************************TRMM POINTER REFRESH MACROS*************************/
+
+
+.macro SHIFT_REG REG1,REG2,SHIFT_VAL
+ .if \SHIFT_VAL==16
+ slwi \REG1, \REG2, 7
+ .elseif \SHIFT_VAL==8
+ slwi \REG1, \REG2, 6
+ .elseif \SHIFT_VAL==4
+ slwi \REG1, \REG2, 5
+ .elseif \SHIFT_VAL==2
+ slwi \REG1, \REG2, 4
+ .elseif \SHIFT_VAL==1
+ slwi \REG1, \REG2, 3
+ .endif
+.endm
+
+/*
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+// ptrbb = bb;
+// #else
+// ptrba += off*8;
+// ptrbb = bb + off*4;
+// #endif
+*/
+.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
+ #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+ /* ptrbb = bb;*/
+ mr \PTR_B,\B_VAL /* refresh BPOINT */
+
+ #else
+ /*
+ // ptrba =ptrba+ off*C_A;
+ // ptrbb = bb + off*C_B;
+ */
+ SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
+ SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
+ add \PTR_B, \B_VAL , T4 /* Add values to BO */
+ add \PTR_A, \PTR_A, T2 /* Add values to AO */
+ #endif
+.endm
+
+
+/*
+// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+// temp = bk-off;
+// #elif defined(LEFT)
+// temp = off+8; // number of values in A
+// #else
+// temp = off+4; // number of values in B
+// #endif
+*/
+.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
+ #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+ /* temp = bk-off;*/
+ sub \TEMP_BK,\BK_VAL,\OFF_VAL
+
+ #elif defined(LEFT)
+ /* temp = off+INCR_A; // number of values in A */
+ addi \TEMP_BK, \OFF_VAL, \INCR_A
+ #else
+ /* temp = off+INCR_B; // number of values in B */
+ addi \TEMP_BK,\OFF_VAL, \INCR_B
+ #endif
+
+.endm
+/*
+// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index f902484a3..fe5d8ade2 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1353,7 +1353,7 @@ ZGEMM_L1: ZGEMM_L1_BEGIN: /*----------------------------------------*/ mr CO, C - slwi T1, LDC , 1 + add T2,C,LDC mr AO, A add C, C, T1 diff --git a/param.h b/param.h index 3934da6c8..84e577acc 100644 --- a/param.h +++ b/param.h @@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 832 #define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 640 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 1026 #define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 640 +#define CGEMM_DEFAULT_Q 1026 #define ZGEMM_DEFAULT_Q 1026 #define SYMV_P 8 From 9086543f503f63d9107ce539650f28918b027015 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:29:47 +0200 Subject: [PATCH 094/127] Utest needs CBLAS but not necessarily FORTRAN --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50da721cd..d7d9c2fce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,7 +211,8 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -if (MSVC OR NOT NOFORTRAN) +#if (MSVC OR NOT NOFORTRAN) +if (NOT NO_CBLAS) # Broken without fortran on unix add_subdirectory(utest) endif() From ae9e8b131e27f65684cf4cb98e03b7df4b290142 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 14:30:33 +0200 Subject: [PATCH 095/127] Add mingw builds to Appveyor config --- appveyor.yml | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 44a616aaa..2f9cc7b0b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,7 +35,14 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - + - COMPILER: MinGW64-gcc-7.2.0-mingw + DYNAMIC_ARCH: OFF + WITH_FORTRAN: ignore + - COMPILER: MinGW64-gcc-7.2.0 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + COMPILER: MinGW-gcc-5.3.0 + WITH_FORTRAN: ignore + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -52,7 +59,14 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% + - if [%COMPILER%]==[MinGW-gcc-5.3.0] set 
PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
   - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
+  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
+  - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
   - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
   - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
   - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
@@ -64,3 +78,4 @@ test_script:
   - echo Running Test
   - cd utest
   - openblas_utest
+

From f69a0be712a9dccf5fcf433a734eb1371cb6189a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:02:39 +0200
Subject: [PATCH 096/127] Add getarch flags to disable AVX on x86 (and other
 small fixes to match Makefile behaviour)

---
 cmake/system.cmake | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 7f3696286..1c2093efe 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -70,6 +70,13 @@ if (X86_64)
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 
+# On x86 no AVX support is available
+if (X86 OR X86_64)
+if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
+ set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
+endif ()
+endif ()
+
 if (INTERFACE64)
   message(STATUS "Using 64-bit integers.")
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@@ -148,7 +155,9 @@ else()
 endif ()
 
 include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
-
+if (DEFINED BINARY)
+ message(STATUS "Compiling a ${BINARY}-bit binary.")
+endif ()
 if (NOT DEFINED NEED_PIC)
   set(NEED_PIC 1)
 endif ()
@@ -165,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
 if (NOT NOFORTRAN)
   # Fortran Compiler dependent settings
   include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
+else ()
+set(NO_LAPACK 1)
+set(NO_LAPACKE 1)
 endif ()
 
 if (BINARY64)
@@ -190,9 +202,14 @@ if (NEED_PIC)
 endif ()
 
 if (DYNAMIC_ARCH)
-  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
-  if (DYNAMIC_OLDER)
-    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+ if (X86 OR X86_64 OR ARM64 OR PPC)
+  set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
+  if (DYNAMIC_OLDER)
+   set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
+  endif ()
+ else ()
+  unset (DYNAMIC_ARCH)
+  message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
  endif ()
 endif ()

From 04d671aae2b452a0bf63837c289f8948c35eb675 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 6 Jul 2019 15:05:04 +0200
Subject: [PATCH 097/127] Make disabling DYNAMIC_ARCH on unsupported systems
 work

DYNAMIC_ARCH needs to be unset in the cache for the change to have any effect
---
 cmake/arch.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index b4547b7c9..5a7434551 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake @@ -81,7 +81,8 @@ if (DYNAMIC_ARCH) endif () if (NOT DYNAMIC_CORE) - unset(DYNAMIC_ARCH) + message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") + unset(DYNAMIC_ARCH CACHE) endif () endif () From 8fb76134bc0711634b410fa20d6eb113f8893a04 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 6 Jul 2019 15:07:15 +0200 Subject: [PATCH 098/127] Mingw32 needs leading underscore on object names (also copy BUNDERSCORE settings for FORTRAN from the corresponding Makefile) --- cmake/prebuild.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index a67c44bf5..e508a46c2 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,6 +59,9 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() +if(MINGW AND NOT MINGW64) + set(FU "_") +endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -82,6 +85,11 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") +else () + file(APPEND ${TARGET_CONF_TEMP} + "#define BUNDERSCORE _\n" + "#define NEEDBUNDERSCORE 1\n") + set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling From b89c781637503ec66117eb3b887a3755d42f0f46 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 7 Jul 2019 16:04:45 +0200 Subject: [PATCH 099/127] Fix surprising behaviour of NO_AFFINITY=0 --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..09a648e4a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,8 +1124,12 @@ endif endif ifdef NO_AFFINITY +ifeq ($(NO_AFFINITY), 0) +override undefine NO_AFFINITY +else CCOMMON_OPT += -DNO_AFFINITY endif +endif ifdef FUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE From b89d9762a29ac84422ebb6092584831efd85d355 Mon Sep 17 00:00:00 2001 From: Isuru Fernando Date: Mon, 8 Jul 2019 17:13:21 -0500 Subject: [PATCH 100/127] Change install_name on osx to match linux --- Makefile | 1 + Makefile.install | 3 ++- exports/Makefile | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 07b08439e..60f189ef2 100644 --- a/Makefile +++ b/Makefile @@ -109,6 +109,7 @@ endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll diff --git a/Makefile.install b/Makefile.install index fefecd98d..8070b4729 100644 --- a/Makefile.install +++ b/Makefile.install @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ - ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ + ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib endif ifeq ($(OSNAME), WINNT) @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..d32e449df 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol libgoto_hpl.def : gensymbol perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > 
$(@F) +ifeq ($(OSNAME), Darwin) +INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib +endif + ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran - $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else - $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif dllinit.$(SUFFIX) : dllinit.c From 0ba29fd2625dfe405a08005a22d0fa21293cc16c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:46:51 +0800 Subject: [PATCH 101/127] Update dgemm_kernel_4x8_haswell.S for zen2 replaced a bunch of vpermpd instructions with vpermilpd and vperm2f128 --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 120 ++++++++++------------- 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c84b599ce..5416018bb 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 @@ -153,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -172,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -181,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -196,7 +196,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -206,7 +206,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 @@ -232,7 +232,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -247,7 +247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO @@ -257,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -284,18 +284,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -324,18 +322,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -365,18 +361,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 32(%rbp) prefetcht0 32(%rbp,LDC) - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -687,7 +681,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -695,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -710,14 +704,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -729,7 +723,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -737,7 +731,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -750,7 +744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vpermpd $ 0x1b, %ymm0 , %ymm0 @@ -758,7 +752,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -770,7 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO @@ -778,7 +772,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -799,18 +793,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,18 +831,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 32(%rax) prefetcht0 32(%rax,LDC) - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1084,13 +1074,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1100,12 +1090,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 A_PR1(AO) vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1114,13 +1104,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_M2 vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1130,13 +1120,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_E vmovups -12 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm @@ -1145,13 +1135,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups -12 * SIZE(BO), %ymm1 vmovups -16 * SIZE(AO), %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO vpermpd $ 0x1b, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 + vpermilpd $ 0x05, %ymm0 , %ymm0 vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1165,18 +1155,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 7a9050d6817dd63e4b3cb641566b03f069be47a9 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 00:55:06 +0800 Subject: [PATCH 102/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5416018bb..b98610524 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -292,8 +292,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -330,8 +330,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -369,8 +369,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -801,8 +801,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -839,8 +839,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - vperm2f128 $ 0x01 , %ymm2, %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 From 182b06d6adb445d00066eff3b15da335ee1656bc Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 17:02:35 +0800 Subject: [PATCH 103/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b98610524..814a1c350 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -317,10 +317,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -356,10 +356,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -395,10 +395,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm @@ -826,10 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -865,10 +865,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 1733f927e6b892610bda045538a42d495faa1af5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 21:27:41 +0800 Subject: [PATCH 104/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 814a1c350..b30ecccea 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -106,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define A_PR1 512 -#define B_PR1 512 +#define B_PR1 160 /******************************************************************************************* * Macro definitions From 211ab03b1402a3c39311b7ca769aaad736ca554c Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 22:39:15 +0800 Subject: [PATCH 105/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index b30ecccea..3f7f9a98e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,23 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x12 + prefetcht0 128(%rsp) /*BUFFER 1*/ vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - + prefetcht0 192(%rsp) vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - + prefetcht0 256(%rsp) vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - + prefetcht0 320(%rsp) vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 From 8a074b39656636ebec5812532b486cf751231a3b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:47:30 +0800 Subject: [PATCH 106/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 42 +++++++++++++++++++++--- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 3f7f9a98e..5242e3efe 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -267,24 +267,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE4x12 - prefetcht0 128(%rsp) /*BUFFER 1*/ + prefetcht0 BUFFER1 vbroadcastsd ALPHA, %ymm0 vmulpd %ymm0 , %ymm4 , %ymm4 vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 192(%rsp) + prefetcht0 64 + BUFFER1 vmulpd %ymm0 , %ymm8 , %ymm8 vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 - prefetcht0 256(%rsp) + prefetcht0 128 + BUFFER1 vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 - prefetcht0 320(%rsp) + prefetcht0 192 + BUFFER1 vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1606,6 +1606,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + addq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + leaq (CO1,LDC,2),CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) + subq LDC,CO1 + subq LDC,CO1 +.endm /*******************************************************************************************/ #if !defined(TRMMKERNEL) @@ -1773,7 +1804,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - + + PREFETCHT0_C .L12_12a: KERNEL4x12_M1 From 9b04baeaeeaaaeba8c12e3fc2418ceaeca53ebb0 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Wed, 17 Jul 2019 23:50:03 +0800 Subject: [PATCH 107/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 5242e3efe..42692f33b 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -318,10 +318,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rax) vmovups %ymm7 , (%rax, LDC) - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -357,10 +357,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -396,10 +396,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
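
NOTE (illustrative sketch, not part of any patch): patch 107, continued
below, demotes the C-tile prefetches from prefetcht0 to prefetcht1. The
hint suffix selects the nearest cache level the line is pulled into: t0
targets all levels including L1, t1 starts at L2, t2 at the last-level
cache (semantics per Intel's documentation). In intrinsics form:

#include <xmmintrin.h>

void touch_cols(const double *c, long ldc) {
    _mm_prefetch((const char *)c,             _MM_HINT_T0); /* prefetcht0 */
    _mm_prefetch((const char *)(c + ldc),     _MM_HINT_T1); /* prefetcht1 */
    _mm_prefetch((const char *)(c + 2 * ldc), _MM_HINT_T2); /* prefetcht2 */
}
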
vmovups %ymm6 , (%rbp) vmovups %ymm7 , (%rbp, LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) addq $ 4*SIZE, CO1 .endm From 9c89757562f43af48645a6563161909321077646 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:47:58 +0800 Subject: [PATCH 108/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 42692f33b..e26bddea3 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,6 +1865,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L12_11 ALIGN_4 @@ -1872,6 +1880,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L12_20: // Test rest of M @@ -2102,7 +2115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jmp .L13_16 - + PREFETCHT0_C .L13_13: test $1, %rax @@ -2147,6 +2160,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + salq $3, K + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B + sarq $3, K + decq I # i -- jne .L13_11 ALIGN_4 @@ -2154,6 +2175,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ + + movq M, I + sarq $2, I + salq $7, I + subq I, B + .L13_20: // Test rest of M From 825777faab163326f38a0e6203ef1fb6fa8de6af Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Fri, 19 Jul 2019 23:58:24 +0800 Subject: [PATCH 109/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index e26bddea3..225af3673 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1865,12 +1865,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) - addq $128, B + addq $128, B /* increment */ sarq $3, K decq I # i -- @@ -1880,6 +1883,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
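
NOTE (illustrative sketch, not part of any patch): patch 108 above warms the
next packed B block with prefetcht2 while the current one is still being
consumed, stepping B forward 128 bytes per 4-row tile and rewinding it by
(M/4)*128 afterwards (the sarq/salq/subq sequence). A scalar sketch of the
same bookkeeping, with hypothetical names:

const double *m_loop(const double *b_next, long m) {
    for (long i = 0; i < m / 4; i++) {
        /* ... one 4x12 tile of compute elided ... */
        __builtin_prefetch((const char *)b_next,      0, 1); /* ~prefetcht2 */
        __builtin_prefetch((const char *)b_next + 64, 0, 1);
        b_next += 128 / sizeof(double);          /* addq $128, B           */
    }
    b_next -= (m / 4) * (128 / sizeof(double));  /* recover the original B */
    return b_next;
}
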
/************************************************************************** * Rest of M ***************************************************************************/ + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I @@ -2160,6 +2164,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 + /* here for the prefetch of next b source block */ + /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* currently an increment of 128 byte is suitable */ salq $3, K prefetcht2 (B) prefetcht2 (B, K, 8) @@ -2175,7 +2182,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /************************************************************************** * Rest of M ***************************************************************************/ - + /* recover the original value of pointer B */ movq M, I sarq $2, I salq $7, I From f49f8047acbea636eb2a3542f306803a1285793b Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 14:33:37 +0800 Subject: [PATCH 110/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 50 ++++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 225af3673..6d1460bb2 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,30 +279,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 >= 96 prefetcht0 128 + BUFFER1 +#endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 >= 160 prefetcht0 192 + BUFFER1 +#endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 - +#if B_PR1 >= 224 + prefetcht0 256 + BUFFER1 +#endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - +#if B_PR1 >= 288 + prefetcht0 320 + BUFFER1 +#endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - +#if B_PR1 >= 352 + prefetcht0 384 + BUFFER1 +#endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#if B_PR1 >= 416 + prefetcht0 448 + BUFFER1 +#endif leaq (CO1, LDC, 2), %rax +#if B_PR1 >= 480 + prefetcht0 512 + BUFFER1 +#endif #if !defined(TRMMKERNEL) @@ -1867,13 +1882,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 32(B) prefetcht2 32(B, K, 8) prefetcht2 96(B) prefetcht2 96(B, K, 8) addq $128, B /* increment */ +#endif sarq $3, K decq I # i -- @@ -1883,10 +1904,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
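
NOTE (back-of-envelope illustration of the proportionality comments in
patches 109/110 above; the patch constants themselves are empirical, and
the block-size figures below are assumptions, not OpenBLAS settings): one
sweep over M runs GEMM_P/4 tiles, and the next 12-wide slab of packed B
holds GEMM_Q * 12 doubles, so a per-tile step that just covers the slab is

long prefetch_step(long gemm_p, long gemm_q) {
    long tiles = gemm_p / 4;                   /* 4-row tiles per M sweep */
    long slab  = gemm_q * 12 * sizeof(double); /* next B slab in bytes    */
    return slab / tiles;  /* 384*GEMM_Q/GEMM_P; ~192 if GEMM_P = 2*GEMM_Q */
}

which is consistent with the chosen increments scaling with GEMM_Q/GEMM_P
(128 bytes where GEMM_P is twice GEMM_Q, 64 where it is four times).
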
/************************************************************************** * Rest of M ***************************************************************************/ - /* recover the original value of pointer B */ + + /* recover the original value of pointer B after prefetch */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L12_20: @@ -2166,13 +2192,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* here for the prefetch of next b source block */ /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ - /* currently an increment of 128 byte is suitable */ + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) addq $128, B +#endif sarq $3, K decq I # i -- @@ -2185,7 +2217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* recover the original value of pointer B */ movq M, I sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ salq $7, I +#endif subq I, B .L13_20: From 94db259e5b432a7f1769c1d61071b9dd727778db Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:04:41 +0800 Subject: [PATCH 111/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 45 ++++++++++-------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6d1460bb2..6a8619e32 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,35 +1622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C + prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) prefetcht0 24(CO1,LDC,4) prefetcht0 (CO1,LDC,8) prefetcht0 24(CO1,LDC,8) - addq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - leaq (CO1,LDC,2),CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) - subq LDC,CO1 - subq LDC,CO1 .endm /*******************************************************************************************/ @@ -1820,12 +1798,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. dec %rax jne .L12_12 - PREFETCHT0_C .L12_12a: - + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2133,9 +2118,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .L13_12a: + PREFETCHT0_C + addq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 KERNEL4x12_M2 KERNEL4x12_M1 @@ -2145,7 +2139,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
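
NOTE (illustrative sketch, not part of any patch): patch 111 above shrinks
PREFETCHT0_C to a single column group and interleaves four invocations with
the KERNEL4x12 steps, adjusting CO1 in between, instead of issuing all
prefetches for the 4x12 C tile in one burst. The shape of that idea in C
(names hypothetical; ldc counts elements):

#include <xmmintrin.h>

static void prefetch_c_cols(const double *col, long ldc) {
    /* columns j, j+4 and j+8 of the tile; byte offsets 0 and 24 cover
       the two cache lines a 4-double column can straddle */
    _mm_prefetch((const char *)&col[0],            _MM_HINT_T0);
    _mm_prefetch((const char *)&col[0] + 24,       _MM_HINT_T0);
    _mm_prefetch((const char *)&col[4 * ldc],      _MM_HINT_T0);
    _mm_prefetch((const char *)&col[4 * ldc] + 24, _MM_HINT_T0);
    _mm_prefetch((const char *)&col[8 * ldc],      _MM_HINT_T0);
    _mm_prefetch((const char *)&col[8 * ldc] + 24, _MM_HINT_T0);
}

void prefetch_c_tile(double *c, long ldc) {
    for (int j = 0; j < 4; j++) {
        /* ... one unrolled KERNEL4x12 step would run here ... */
        prefetch_c_cols(c + j * ldc, ldc);
    }
}
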
jmp .L13_16 - PREFETCHT0_C .L13_13: test $1, %rax From 9440fa607d146f1b91d70e35404f0d4abe50ffc5 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sat, 20 Jul 2019 22:08:22 +0800 Subject: [PATCH 112/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 6a8619e32..c834239be 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1622,7 +1622,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro PREFETCHT0_C - prefetcht0 ALPHA prefetcht0 (CO1) prefetcht0 24(CO1) prefetcht0 (CO1,LDC,4) @@ -1799,6 +1798,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L12_12 .L12_12a: + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 @@ -2117,7 +2117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. jne .L13_12 .L13_12a: - + prefetcht0 ALPHA PREFETCHT0_C addq LDC,CO1 KERNEL4x12_M1 From 4801c6d36bd87421b08e60efa1b6e0217fd41672 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 00:47:45 +0800 Subject: [PATCH 113/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index c834239be..26eea0acf 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1866,7 +1866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ @@ -2184,19 +2184,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE4x12 /* here for the prefetch of next b source block */ - /* the increment should be proportional to the ratio of GEMM_Q/GEMM_P */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ salq $3, K #ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ prefetcht2 (B) prefetcht2 (B, K, 8) - addq $64, B + addq $64, B /* increment */ #else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ prefetcht2 (B) prefetcht2 (B, K, 8) prefetcht2 64(B) prefetcht2 64(B, K, 8) - addq $128, B + addq $128, B /* increment */ #endif sarq $3, K From 95fb98f556adcbbccc5f42318c7c645ec1837e1a Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 21 Jul 2019 01:10:32 +0800 Subject: [PATCH 114/127] Update dgemm_kernel_4x8_haswell.S --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 26eea0acf..082e62a7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -279,43 +279,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vmulpd %ymm0 , %ymm9 , %ymm9 vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 >= 96 +#if B_PR1 > 32 prefetcht0 128 + BUFFER1 #endif vmulpd %ymm0 , %ymm12, %ymm12 vmulpd %ymm0 , %ymm13, %ymm13 vmulpd %ymm0 , %ymm14, %ymm14 vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 >= 160 +#if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 -#if B_PR1 >= 224 +#if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#if B_PR1 >= 288 +#if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#if B_PR1 >= 352 +#if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#if B_PR1 >= 416 +#if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif leaq (CO1, LDC, 2), %rax -#if B_PR1 >= 480 +#if B_PR1 > 416 prefetcht0 512 + BUFFER1 #endif From 28e96458e5a4b2d8039ed16048a07892a7c960bf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 22 Jul 2019 08:28:16 +0200 Subject: [PATCH 115/127] Replace vpermpd with vpermilpd to improve performance on Zen/Zen2 (as demonstrated by wjc404 in #2180) --- kernel/x86_64/zdot_microk_haswell-2.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 9f2fc2c1d..4eade7bfd 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" @@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" - "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" +// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t" +// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" "vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i "vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i - "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" - "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + "vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t" + "vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" +// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t" +// 
"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" "vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r "addq $16 , %0 \n\t" From 3f6ab1582aca019cf5514aac3af98dcb66c9bbd6 Mon Sep 17 00:00:00 2001 From: Tyler Reddy Date: Mon, 22 Jul 2019 21:24:57 -0600 Subject: [PATCH 116/127] MAINT: remove legacy CMake endif() * clean up a case where CMake endif() contained the conditional used in the if(), which is no longer needed / discouraged since our minimum required CMake version supports the modern syntax --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..610f689e0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) if(${OPERATING_SYSTEM} MATCHES "Android") set(HOST_OS ANDROID) - endif(${OPERATING_SYSTEM} MATCHES "Android") + endif() endif() From af2e7f28fce42e39fd3d4e108dfb4d55b377b5ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 23 Jul 2019 16:56:40 +0200 Subject: [PATCH 117/127] Override special make variables as seen in https://github.com/xianyi/OpenBLAS/issues/1912#issuecomment-514183900 , any external setting of TARGET_ARCH (which could result from building OpenBLAS as part of a larger project that actually uses this variable) would cause the utest build to fail. (Other subtargets appear to be unaffected as they do not use implicit make rules) --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..5846db0bb 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -1,6 +1,9 @@ UTEST_CHECK = 1 TOPDIR = .. +override TARGET_ARCH= +override TARGET_MACH= + UTESTBIN=openblas_utest .PHONY : all From 30efed14d1aa9e1fba887aeddac964b841dd4720 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 24 Jul 2019 15:26:09 +0200 Subject: [PATCH 118/127] Unset special make variables in ctest Makefile as well --- ctest/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ctest/Makefile b/ctest/Makefile index 569a5dda3..f562c9bb3 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,8 @@ TOPDIR = .. include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= LIB = $(TOPDIR)/$(LIBNAME) From 7eecd8e39cfd3bf3f8eddc1154b8b2bfec19ea33 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Sun, 28 Jul 2019 07:39:09 +0800 Subject: [PATCH 119/127] Add files via upload --- kernel/x86_64/dgemm_kernel_4x8_haswell.S | 334 ++++++++++++++++++++++- 1 file changed, 325 insertions(+), 9 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 082e62a7c..19e32ef2c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -107,6 +107,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define A_PR1 512 #define B_PR1 160 +#define BROADCASTKERNEL /******************************************************************************************* * Macro definitions @@ -133,7 +134,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 prefetcht0 B_PR1(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1+64(BO) vmovups -8 * SIZE(BO), %ymm2 prefetcht0 B_PR1+128(BO) @@ -143,17 +148,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 ,%ymm2 , %ymm8 vmulpd %ymm0 ,%ymm3 , %ymm12 prefetcht0 B_PR1+256(BO) +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 vmulpd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 12*SIZE, BO vmulpd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -165,23 +182,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_M1 prefetcht0 A_PR1(AO) +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 prefetcht0 B_PR1+128(BO) vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -192,21 +224,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups 0 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -218,21 +266,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else vmovups -12 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -241,23 +305,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x12_SUB vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else vmovups -16 * SIZE(AO), %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 vmovups -4 * SIZE(BO), %ymm3 vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 12*SIZE, BO vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 vfmadd231pd %ymm0 ,%ymm3 , %ymm15 @@ -289,27 +369,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if B_PR1 > 96 prefetcht0 192 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + #if B_PR1 > 160 prefetcht0 256 + BUFFER1 #endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + #if B_PR1 > 224 prefetcht0 320 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + #if B_PR1 > 288 prefetcht0 384 + BUFFER1 #endif + +#ifndef BROADCASTKERNEL vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + #if B_PR1 > 352 prefetcht0 448 + BUFFER1 #endif @@ -338,11 +444,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
prefetcht1 56(%rax) prefetcht1 56(%rax,LDC) - vpermilpd $ 0x05 , %ymm9 , %ymm9 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 @@ -353,7 +469,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -377,6 +493,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht1 56(%rbp) prefetcht1 56(%rbp,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm13, %ymm13 vpermilpd $ 0x05 , %ymm15, %ymm15 @@ -392,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 4), %rax leaq (%rbp, LDC, 4), %rbp @@ -693,19 +819,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_I vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmovups -8 * SIZE(BO), %ymm2 vmulpd %ymm0 ,%ymm1 , %ymm4 vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 vmulpd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vmulpd %ymm0 ,%ymm2 , %ymm11 @@ -715,19 +857,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
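
NOTE (illustrative sketch, not part of any patch): the SAVE4x12 hunks above
show the other half of the BROADCASTKERNEL trade: the accumulators now hold
rows of each 4x4 sub-tile while C is column-major, so the old vpermilpd and
vblendpd untangling is replaced by a full 4x4 transpose built from
vperm2f128 plus vunpcklpd/vunpckhpd. The same sequence in intrinsics:

#include <immintrin.h>

void transpose_4x4(const __m256d row[4], __m256d col[4]) {
    /* vperm2f128 $0x20 / $0x31: pair matching 128-bit lanes of rows */
    __m256d t0 = _mm256_permute2f128_pd(row[0], row[2], 0x20);
    __m256d t1 = _mm256_permute2f128_pd(row[1], row[3], 0x20);
    __m256d t2 = _mm256_permute2f128_pd(row[0], row[2], 0x31);
    __m256d t3 = _mm256_permute2f128_pd(row[1], row[3], 0x31);
    /* vunpcklpd / vunpckhpd: interleave elements within each lane */
    col[0] = _mm256_unpacklo_pd(t0, t1);
    col[1] = _mm256_unpackhi_pd(t0, t1);
    col[2] = _mm256_unpacklo_pd(t2, t3);
    col[3] = _mm256_unpackhi_pd(t2, t3);
}
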
.macro KERNEL4x8_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif prefetcht0 B_PR1(BO) vfmadd231pd %ymm0 ,%ymm1 , %ymm4 prefetcht0 B_PR1+64(BO) vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -736,18 +893,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -4 * SIZE(BO), %ymm1 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -757,18 +930,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 addq $ 8*SIZE, BO @@ -776,19 +965,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x8_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 vmovups -8 * SIZE(BO), %ymm2 vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 vfmadd231pd %ymm0 ,%ymm2 , %ymm9 addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 vfmadd231pd %ymm0 ,%ymm2 , %ymm10 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vfmadd231pd %ymm0 ,%ymm2 , %ymm11 @@ -809,6 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm10, %ymm10 vmulpd %ymm0 , %ymm11, %ymm11 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -824,6 +1039,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax @@ -847,6 +1063,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. prefetcht0 56(%rax) prefetcht0 56(%rax,LDC) +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm9 , %ymm9 vpermilpd $ 0x05 , %ymm11, %ymm11 @@ -862,7 +1088,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - +#endif leaq (%rax, LDC, 2), %rax leaq (%rax, LDC, 2), %rbp @@ -1088,15 +1314,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I prefetcht0 A_PR1(AO) vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vmulpd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 @@ -1104,29 +1346,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_M1 prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -12 * SIZE(BO), %ymm1 .endm .macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 vmovups -8 * SIZE(BO), %ymm1 addq $ 8*SIZE, BO @@ -1134,30 +1407,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else vmovups -12 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 addq $ 4*SIZE, BO .endm .macro KERNEL4x4_SUB vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else vmovups -16 * SIZE(AO), %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm5 addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm6 addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif vfmadd231pd %ymm0 ,%ymm1 , %ymm7 .endm @@ -1171,6 +1476,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmulpd %ymm0 , %ymm5 , %ymm5 vmulpd %ymm0 , %ymm6 , %ymm6 +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else vpermilpd $ 0x05 , %ymm5, %ymm5 vpermilpd $ 0x05 , %ymm7, %ymm7 @@ -1186,6 +1501,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif leaq (CO1, LDC, 2), %rax From 2dfb804cb943ac12035fe51859d109daca76b4f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 28 Jul 2019 23:17:28 +0200 Subject: [PATCH 120/127] Replace vpermpd with vpermilpd in the Haswell DTRMM kernel to improve performance on AMD Zen (#2180) applying wjc404's improvement of the DGEMM kernel from #2186 --- kernel/x86_64/dtrmm_kernel_4x8_haswell.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c index 651736b89..2acdc4615 100644 --- a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c +++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c @@ -33,7 +33,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9 \n\t" @@ -41,7 +41,7 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11 \n\t" @@ -62,18 +62,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmulpd %%ymm0 , %%ymm10, %%ymm10 \n\t" " vmulpd %%ymm0 , %%ymm11, %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm5 , %%ymm5 \n\t" - " vpermpd $0xb1 , %%ymm7 , %%ymm7 \n\t" + " vpermilpd $0x05 , %%ymm5 , %%ymm5 \n\t" + " vpermilpd $0x05 , %%ymm7 , %%ymm7 \n\t" " vblendpd $0x0a , %%ymm5 , %%ymm4 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm5 , %%ymm4 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm7 , %%ymm6 , %%ymm2 \n\t" " vblendpd $0x05 , %%ymm7 , %%ymm6 , %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" @@ -85,18 +83,16 @@ static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOA " vmovups %%ymm6 , (%7) \n\t" " vmovups %%ymm7 , (%8) \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm4 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm5 \n\t" From 648491e1aa5cec7e8b8947d8ce47a825ceba705d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:51:09 +0200 Subject: [PATCH 121/127] Autodetect 
Intel Ice Lake (as SKYLAKEX target) --- cpuid_x86.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..141d6044e 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1211,7 +1211,7 @@ int get_cpuname(void){ return CPUTYPE_CORE2; } break; - case 1: + case 1: // family 6 exmodel 1 switch (model) { case 6: return CPUTYPE_CORE2; @@ -1228,7 +1228,7 @@ int get_cpuname(void){ return CPUTYPE_DUNNINGTON; } break; - case 2: + case 2: // family 6 exmodel 2 switch (model) { case 5: //Intel Core (Clarkdale) / Core (Arrandale) @@ -1257,7 +1257,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 3: + case 3: // family 6 exmodel 3 switch (model) { case 7: // Bay Trail @@ -1287,7 +1287,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 4: + case 4: // family 6 exmodel 4 switch (model) { case 5: case 6: @@ -1321,7 +1321,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 5: + case 5: // family 6 exmodel 5 switch (model) { case 6: //Broadwell @@ -1364,7 +1364,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 6: + case 6: // family 6 exmodel 6 switch (model) { case 6: // Cannon Lake if(support_avx512()) @@ -1376,7 +1376,20 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; } - break; + break; + case 7: // family 6 exmodel 7 + switch (model) { + case 14: // Ice Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; case 9: case 8: switch (model) { From 3d36c4511693bfd7c117465a701c5ff1f19f8565 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 1 Aug 2019 22:52:35 +0200 Subject: [PATCH 122/127] Add CPUID identification of Intel Ice Lake --- driver/others/dynamic.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..f1cd3c6e6 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ } } return NULL; + case 7: + if (model == 14) { + // Ice Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; case 9: case 8: - if (model == 14 ) { // Kaby Lake + if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From acf6002ab242f98460845bb71db8fefdbdb26a1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Aug 2019 12:40:13 +0200 Subject: [PATCH 123/127] Replace most vpermpd calls in the Haswell DTRSM_RN kernel --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 36 +++++++++++-------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..cb939e762 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -132,7 +132,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "1: \n\t" " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd 
$0x05 , %%ymm0 , %%ymm3 \n\t" // was vpermpd 0xb1 " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" @@ -143,7 +143,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -160,7 +160,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -170,7 +170,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " addq $8, %1 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" @@ -185,7 +185,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" @@ -193,7 +193,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vpermilpd $0x05 , %%ymm0 , %%ymm0 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm11 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" @@ -204,7 +204,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" @@ -212,42 +212,38 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vpermilpd $0x05 , %%ymm4 , %%ymm4 \n\t" " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" "3: \n\t" - " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" - " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + " vpermilpd $0x05 , %%ymm9 , %%ymm9 \n\t" + " vpermilpd $0x05 , %%ymm11, %%ymm11 \n\t" " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" 
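
NOTE (illustrative check, not part of any patch): patches 115, 120 and this
one make the same substitution for the same reason. For these shuffles the
two encodings are functionally identical: vpermpd $0xb1 and vpermilpd $0x05
both swap adjacent element pairs. The difference is that vpermilpd stays
inside each 128-bit lane, which is cheaper on CPUs that execute 256-bit ops
as two halves (AMD Zen, per the #2180 discussion cited above). A quick
equivalence test (compile with gcc -mavx2):

#include <immintrin.h>
#include <string.h>
#include <stdio.h>

int main(void) {
    __m256d v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
    __m256d a = _mm256_permute4x64_pd(v, 0xb1); /* vpermpd, cross-lane */
    __m256d b = _mm256_permute_pd(v, 0x05);     /* vpermilpd, in-lane  */
    puts(memcmp(&a, &b, sizeof a) == 0 ? "identical" : "differ");
    return 0;
}
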
" vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" - " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" - " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + " vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t" + " vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t" " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" - " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" - " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" - " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + " vperm2f128 $0x01 , %%ymm2 , %%ymm2 , %%ymm2 \n\t" + " vperm2f128 $0x01 , %%ymm3 , %%ymm3 , %%ymm3 \n\t" " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" From 4e2f81cfa1f6dfa24912c3ff88470471b39b695e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 8 Aug 2019 23:15:35 +0200 Subject: [PATCH 124/127] Provide more information on mmap/munmap failure for #2207 --- driver/others/memory.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..77d2b72fa 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ +if (!release->address) return 0; + if (munmap(release -> address, BUFFER_SIZE)) { - printf("OpenBLAS : munmap failed\n"); + int errsv=errno; + perror("OpenBLAS : munmap failed:"); + printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); } } @@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif + } else { +#ifdef DEBUG + int errsv=errno; + perror("OpenBLAS : mmap failed:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); +#endif } #ifdef OS_LINUX From 1776ad82c01c0f9efeeda043eb02e10187084066 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 9 Aug 2019 00:08:11 +0200 Subject: [PATCH 125/127] Add files via upload --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 77d2b72fa..534d6d9fc 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2041,7 +2041,7 @@ static BLASULONG alloc_lock = 0UL; static void alloc_mmap_free(struct release_t *release){ -if (!release->address) return 0; +if (!release->address) return; if (munmap(release -> address, BUFFER_SIZE)) { int errsv=errno; From b7bbb02447ed612e380dc1ca6d6e7a26f48dc868 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 12:46:05 +0200 Subject: [PATCH 126/127] Silence two nuisance warnings from gcc --- cpuid_arm64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a5e731d74..e8aa29813 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -94,7 +94,7 @@ int get_feature(char *search) if( p == NULL ) return 0; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { if (!strcmp(t, search)) { return(1); } } @@ -344,7 +344,7 @@ void get_features(void) if( p == NULL ) return; t = strtok(p," "); - while( t = strtok(NULL," ")) + while( (t = strtok(NULL," "))) { } From be147a9f28889d831019c6f860d501b2546e3771 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Aug 2019 16:24:39 +0200 
Subject: [PATCH 127/127] Avoid adding a spurious dependency on the fortran runtime despite NOFORTRAN=1 for cases where a fortran compiler is present but not wanted (e.g. not fully functional) --- Makefile.system | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 16791bcc2..835c76e78 100644 --- a/Makefile.system +++ b/Makefile.system @@ -267,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy OBJCONV = $(CROSS_SUFFIX)objconv -# For detect fortran failed, only build BLAS. +# When fortran support was either not detected or actively deselected, only build BLAS. ifeq ($(NOFORTRAN), 1) NO_LAPACK = 1 +override FEXTRALIB = endif #
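
NOTE (closing illustration for the Ice Lake detection patches 121/122
above, not part of any patch): "family 6 exmodel 7" with model 14 is the
split CPUID encoding of model 0x7E; for family 6 the effective model is
(extended_model << 4) | model. A minimal decode using GCC's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return 1;
    unsigned family  = (eax >> 8) & 0xf;
    unsigned model   = (eax >> 4) & 0xf;
    unsigned exmodel = (eax >> 16) & 0xf;
    if (family == 6)
        model |= exmodel << 4;       /* Ice Lake client: 0x7e */
    printf("family %u, exmodel %u, model 0x%x\n", family, exmodel, model);
    return 0;
}
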