diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ab8fbfcd9..00d31f8b6 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,12 +7,12 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S ZTRMMKERNEL = zgemm_kernel_power9.S -SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c @@ -22,7 +22,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S @@ -32,7 +32,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMKERNEL = cgemm_kernel_power10.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 000000000..e04f948dd --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs51 +#define alpha_i vs55 +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 000000000..3700ac87b --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 
+ KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 
32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 000000000..b66e93405 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 8 +#define DISP32(ind, disp) (ind*unit_size*32+disp) +#define DISP16(ind, disp) (ind*unit_size*16+disp) +#define DISP8(ind, disp) (ind*unit_size*8+disp) +#define DISP4(ind, disp) (ind*unit_size*4+disp) +#define DISP2(ind, disp) (ind*unit_size*2+disp) +#define DISP1(ind, disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmulsp \VSOUT1, \VSINII, alpha_i + xvmulsp \VSOUT2, \VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmsubasp \VSOUT1, \VSINRR, alpha_r + xvmaddasp \VSOUT2, \VSINII, alpha_r +.endm + +.macro PERMUTE1 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, \OUT, vs62, 1 +.endm +.macro PERMUTE2 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, vs62, \OUT, 1 + xxperm \OUT, \OUT, permute_mask +.endm +.macro PERMUTE3 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, vs62, \OUT, 2 +.endm +.macro PERMUTE4 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, \OUT, vs62, 2 + xxperm \OUT, \OUT, permute_mask +.endm +.macro GROUP1 + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + xxperm vs9, vs37, permute_mask + xxperm vs13, vs45, permute_mask +.endm +.macro AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + 
AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 +.endm +.macro GROUP2 + xxperm vs0, vs34, permute_mask + xxperm vs4, vs42, permute_mask + xxperm vs1, vs35, permute_mask + xxperm vs5, vs43, permute_mask + xxperm vs8, vs38, permute_mask + xxperm vs12, vs46, permute_mask + xxperm vs9, vs39, permute_mask + xxperm vs13, vs47, permute_mask +.endm +.macro AGG_GROUP2 + AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 + AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 + AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 + AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 +.endm +.macro MULTIPLY_GROUP1 + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +.endm +.macro MULTIPLY_GROUP2 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 +.endm +/* reconstruct r, i pairs*/ +.macro RECONSTRUCT_PAIR1 + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs8, vs9, save_permute_1 + xxperm vs10, vs11, save_permute_1 +.endm +.macro RECONSTRUCT_PAIR2 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 + xxperm vs12, vs13, save_permute_1 + xxperm vs14, vs15, save_permute_1 +.endm +.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 + xxmfacc \ACC + PERMUTE1 \O1, \R3, \R2, \R1, \R0 + PERMUTE2 \O2, \R1, \R0, \R3, \R2 + PERMUTE3 \O3, \R1, \R0, \R3, \R2 + PERMUTE4 \O4, \R3, \R2, \R1, \R0 +.endm +/* macros for N=4 and M=8 +**********************************************************************************************/ +.macro ZERO4x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + +.macro LOAD4x8 + LOAD4x8O 0, 0 +.endm + +.macro LOAD4x8O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END4x8_NORMAL + END4x8 AO, BO, 64, 32 +.endm + +.macro END4x8_WITHOUT_ADD + END4x8 AO, BO, 0, 0 +.endm + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.endm + +.macro LOAD4x8_2 + LOAD4x8_2O 0, 0 +.endm + +.macro LOAD4x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs40, (64+\OffsetA)(AO) + lxvp vs42, (64+32+\OffsetA)(AO) +.endm + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 +.endm + +.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + 
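/* first of the two unrolled k iterations: each xvf32gerpp adds a 4x4 FP32 rank-1 product into its accumulator; accumulators 0-3 pair the A registers vs32/vs33/vs36/vs37 with B register vs35, accumulators 4-7 pair them with vs34 */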
xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 42, 39 + xvf32gerpp 2, 43, 39 + xvf32gerpp 1, 40, 39 + xvf32gerpp 0, 41, 39 + xvf32gerpp 7, 42, 38 + xvf32gerpp 6, 43, 38 + xvf32gerpp 5, 40, 38 + xvf32gerpp 4, 41, 38 +.if \Complete==0 + lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, 
vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + 
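Note on the save path above: MULT_APLHA_PART1 and MULT_APLHA_PART2 together perform an ordinary complex multiply of the aggregated (real, imaginary) accumulator lanes by alpha, split into a plain multiply by alpha_i followed by a fused multiply-subtract/multiply-add with alpha_r. A minimal scalar C sketch of that arithmetic follows; the cfloat type and the scale_by_alpha name are illustrative only and not part of the kernel.

#include <stdio.h>

typedef struct { float r, i; } cfloat;

/* scalar model of MULT_APLHA_PART1 followed by MULT_APLHA_PART2 */
static cfloat scale_by_alpha (cfloat acc, float alpha_r, float alpha_i)
{
  float t_r = acc.i * alpha_i;     /* PART1: xvmulsp by alpha_i */
  float t_i = acc.r * alpha_i;
  cfloat out;
  out.r = acc.r * alpha_r - t_r;   /* PART2: xvmsubasp by alpha_r */
  out.i = t_i + acc.i * alpha_r;   /* PART2: xvmaddasp by alpha_r */
  return out;                      /* == (acc.r + i*acc.i) * (alpha_r + i*alpha_i) */
}

int main (void)
{
  cfloat acc = { 3.0f, -2.0f };                /* made-up accumulator value */
  cfloat c = scale_by_alpha (acc, 0.5f, 1.5f);
  printf ("%.1f %.1f\n", c.r, c.i);            /* prints 4.5 3.5 */
  return 0;
}

The AGGREGATE_REALS_IMAGES variants earlier in this file supply the (real, imaginary) inputs with the sign pattern each conjugation case needs, which is why the CC/CR/RC/RR branch notes in its comments that it negates alpha instead.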
+.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro 
LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, 
OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, 
BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 
AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs34, DISP4(\Index, \OffsetA)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + 
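/* xxperm with permute_mask swaps the real and imaginary words of each single-precision complex element; AGGREGATE_REALS_IMAGES_A_PERMUTE below combines the original and swapped copies into the real and imaginary parts of the products */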
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 
v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, 
vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES 
vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, 
vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 + xvmaddasp vs40, vs4, vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) + lxv vs4, DISP2(\Index, \OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 + xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROSES*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif 
defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 000000000..b3ee301be --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +#ifdef TRMMKERNEL +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif 
+ v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp 
<< 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + 
REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] 
+= t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c new file mode 100644 index 000000000..01c122c6d --- /dev/null +++ b/kernel/power/sgemm_kernel_power10.c @@ -0,0 +1,1334 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * 
alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL 
(10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t 
*rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i 
= m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 
1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, 
acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l 
<< 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * 
alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +}
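
Reviewer note (not part of the patch): the two new C kernels (sgemm and dgemm) follow the same MMA pattern per k iteration -- zero a `__vector_quad` accumulator, chain rank-1 updates with an outer-product builtin, then disassemble the accumulator and scale by alpha in the SAVE_* macros. The sketch below is a hypothetical, self-contained illustration of that single-precision sequence; the example values and file name are my own and nothing here is taken from the diff beyond the builtin calls and the result[3..0] row ordering used by SAVE_ACC. It assumes GCC >= 10 targeting POWER10.

/* mma_demo.c -- hypothetical standalone sketch of the xvf32gerpp sequence.
 * Build: gcc -O2 -mcpu=power10 mma_demo.c
 */
#include <stdio.h>

typedef unsigned char vec_t __attribute__ ((vector_size (16)));
typedef float v4sf_t __attribute__ ((vector_size (16)));

int
main (void)
{
  /* One 4-element slice of packed B and packed A, example values only. */
  float b[4] __attribute__ ((aligned (16))) = { 1, 2, 3, 4 };
  float a[4] __attribute__ ((aligned (16))) = { 10, 20, 30, 40 };
  float alpha = 0.5f;
  float c[4][4] = { { 0 } };

  __vector_quad acc;                   /* 512-bit accumulator                 */
  v4sf_t result[4];                    /* accumulator rows after disassemble  */
  v4sf_t valpha = { alpha, alpha, alpha, alpha };

  __builtin_mma_xxsetaccz (&acc);      /* acc = 0, as SET_ACC_ZERO* does      */
  /* acc[i][j] += b[i] * a[j]; the kernels pass rowB first, rowA second,
     and repeat this call over k.                                             */
  __builtin_mma_xvf32gerpp (&acc, *(vec_t *) b, *(vec_t *) a);
  __builtin_mma_disassemble_acc (result, &acc);

  /* Scale by alpha and add into the 4x4 C tile, reading result[3 - i]
     for row i as the patch's SAVE_ACC macro does.                            */
  for (int i = 0; i < 4; i++)
    {
      v4sf_t row = result[3 - i] * valpha;
      for (int j = 0; j < 4; j++)
        c[i][j] += row[j];
      printf ("%6.1f %6.1f %6.1f %6.1f\n", c[i][0], c[i][1], c[i][2], c[i][3]);
    }
  return 0;
}

The double-precision kernel differs mainly in that it first packs B values into a `__vector_pair` with `__builtin_mma_assemble_pair` and accumulates with `__builtin_mma_xvf64gerpp`, so each accumulator holds a 4x2 tile of doubles rather than a 4x4 tile of floats.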