From a51102e9b749bf7bb72930c491e9faaf7c1426fc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 6 Apr 2016 11:15:21 +0200 Subject: [PATCH 1/4] bugfixes for sgemm- and cgemm-kernel --- kernel/power/cgemm_kernel_8x4_power8.S | 8 ++++---- kernel/power/sgemm_kernel_16x8_power8.S | 8 ++++---- param.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index f90069e3f..91a48d190 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 400 +#define STACKSIZE 512 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -290,9 +290,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32 , 32 li o48 , 48 - li T1, 256 - slwi T1, T1, 9 // 131072 - sub BBUFFER, A, T1 // temp buffer for B unrolled + li T1, 512 + slwi T1, T1, 16 + add BBUFFER, A, T1 #ifdef __64BIT__ diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c2dc1f651..20c94cd94 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 340 +#define STACKSIZE 512 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -271,9 +271,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32, 32 li o48, 48 - li T1, 256 - slwi T1, T1, 9 // 131072 - sub BBUFFER, A, T1 // temp buffer for B unrolled + li T1, 512 + slwi T1, T1, 16 + add BBUFFER, A, T1 addi T1, SP, 300 stxsspx f1, o0 , T1 diff --git a/param.h b/param.h index d01c992c4..84ef7671a 100644 --- a/param.h +++ b/param.h @@ -1965,7 +1965,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 131072 -#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_OFFSET_B 131072 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 @@ -1985,12 +1985,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 -#define ZGEMM_DEFAULT_Q 360 +#define ZGEMM_DEFAULT_Q 720 #define SGEMM_DEFAULT_R 14400 #define DGEMM_DEFAULT_R 14400 #define CGEMM_DEFAULT_R 14400 -#define ZGEMM_DEFAULT_R 7200 +#define ZGEMM_DEFAULT_R 14400 #define SYMV_P 8 From 9c42f0374a434e18302aa4a7957955dd66fc630b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 7 Apr 2016 15:08:15 +0200 Subject: [PATCH 2/4] Updated cgemm- and sgemm-kernel for POWER8 SMP --- common_power.h | 2 +- kernel/power/cgemm_kernel_8x4_power8.S | 36 +++++++++++++++---------- kernel/power/sgemm_kernel_16x8_power8.S | 27 ++++++++++++------- param.h | 8 +++--- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/common_power.h b/common_power.h index 052d38828..723d949f2 100644 --- a/common_power.h +++ b/common_power.h @@ -798,7 +798,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) -#define BUFFER_SIZE ( 64 << 20) +#define BUFFER_SIZE ( 32 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 91a48d190..0c462ce8e 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 512 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_sr vs30 #define alpha_si vs31 +#define FRAMEPOINTER r12 #define BBUFFER r14 #define L r15 @@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 @@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
li o32 , 32 li o48 , 48 - li T1, 512 - slwi T1, T1, 16 - add BBUFFER, A, T1 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 #ifdef __64BIT__ @@ -392,6 +397,9 @@ L999: #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 20c94cd94..77f3f7cfb 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 512 +#define STACKSIZE 32752 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#define FRAMEPOINTER r12 + #define BBUFFER r14 #define o4 r15 #define o12 r16 @@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 @@ -231,7 +237,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif @@ -239,17 +245,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif @@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32, 32 li o48, 48 - li T1, 512 - slwi T1, T1, 16 - add BBUFFER, A, T1 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 addi T1, SP, 300 stxsspx f1, o0 , T1 @@ -355,6 +361,9 @@ L999: #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/param.h b/param.h index 84ef7671a..2efd9b2c1 100644 --- a/param.h +++ b/param.h @@ -1964,8 +1964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 131072 -#define GEMM_DEFAULT_OFFSET_B 131072 +#define GEMM_DEFAULT_OFFSET_A 4096 +#define GEMM_DEFAULT_OFFSET_B 4096 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 @@ -1987,9 +1987,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 720 -#define SGEMM_DEFAULT_R 14400 +#define SGEMM_DEFAULT_R 21600 #define DGEMM_DEFAULT_R 14400 -#define CGEMM_DEFAULT_R 14400 +#define CGEMM_DEFAULT_R 16200 #define ZGEMM_DEFAULT_R 14400 #define SYMV_P 8 From e173c51c0416dade779478b698ccff9429034a7f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 8 Apr 2016 09:05:37 +0200 Subject: [PATCH 3/4] updated zgemm- and ztrmm-kernel for POWER8 --- kernel/power/zgemm_kernel_8x2_power8.S | 97 +- kernel/power/zgemm_logic_8x2_power8.S | 427 ++-- kernel/power/zgemm_macros_8x2_power8.S | 497 ++-- kernel/power/ztrmm_kernel_8x2_power8.S | 2 +- kernel/power/ztrmm_macros_8x2_power8.S | 3110 ++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 3611 insertions(+), 526 deletions(-) create mode 100644 kernel/power/ztrmm_macros_8x2_power8.S diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index a7665f749..336b13b1f 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define alpha_r vs30 #define alpha_i vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE - addi SP, SP, -STACKSIZE - li r0, 0 + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) @@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble .L999 + ble L999 cmpwi cr0, N, 0 - ble .L999 + ble L999 cmpwi cr0, K, 0 - ble .L999 + ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif - lxvdsx alpha_r, 0, ALPHA - lxvdsx alpha_i, o8, ALPHA + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA - .align 5 + .align 4 #include "zgemm_logic_8x2_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) @@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 5fcade5bf..96612da82 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,83 +1,111 @@ srawi. 
J, N, 1 - ble .LZGEMM_L2_END + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +ZGEMM_L2_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L2_COPYB -.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LZGEMM_L2x8_END + ble ZGEMM_L2x8_END -.LZGEMM_L2x8_BEGIN: +ZGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x8_SUB0 + ble ZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x8_SUB4 + ble ZGEMM_L2x8_SUB4 -.LZGEMM_L2x8_LOOP_START: +ZGEMM_L2x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -2 - ble .LZGEMM_L2x8_LOOP_END + ble ZGEMM_L2x8_LOOP_END .align 5 -.LZGEMM_L2x8_LOOP: +ZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LZGEMM_L2x8_LOOP + bgt ZGEMM_L2x8_LOOP -.LZGEMM_L2x8_LOOP_END: +ZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE @@ -88,9 +116,9 @@ KERNEL2x8_1 KERNEL2x8_E2 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB4: +ZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +134,53 @@ KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB0: +ZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x8_SAVE - b .LZGEMM_L2x8_SUB2 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SUB1: +ZGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x8_SAVE + ble ZGEMM_L2x8_SAVE -.LZGEMM_L2x8_SUB2: +ZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x8_SUB2 + bgt ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SAVE: +ZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LZGEMM_L2x8_BEGIN + bgt ZGEMM_L2x8_BEGIN -.LZGEMM_L2x8_END: +ZGEMM_L2x8_END: -.LZGEMM_L2x4_BEGIN: +ZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L2x1_END + ble ZGEMM_L2x1_END andi. T1, M, 4 - ble .LZGEMM_L2x4_END - mr BO, B + ble ZGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x4_SUB0 + ble ZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x4_SUB4 + ble ZGEMM_L2x4_SUB4 -.LZGEMM_L2x4_LOOP_START: +ZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +194,11 @@ KERNEL2x4_2 addic. L, L, -2 - ble .LZGEMM_L2x4_LOOP_END + ble ZGEMM_L2x4_LOOP_END .align 5 -.LZGEMM_L2x4_LOOP: +ZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +211,9 @@ KERNEL2x4_2 addic. L, L, -1 - bgt .LZGEMM_L2x4_LOOP + bgt ZGEMM_L2x4_LOOP -.LZGEMM_L2x4_LOOP_END: +ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +225,9 @@ KERNEL2x4_1 KERNEL2x4_E2 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB4: +ZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +239,48 @@ KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB0: +ZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. 
L, L, -1 - ble .LZGEMM_L2x4_SAVE - b .LZGEMM_L2x4_SUB2 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SUB1: +ZGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x4_SAVE + ble ZGEMM_L2x4_SAVE -.LZGEMM_L2x4_SUB2: +ZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x4_SUB2 + bgt ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SAVE: +ZGEMM_L2x4_SAVE: SAVE2x4 -.LZGEMM_L2x4_END: +ZGEMM_L2x4_END: -.LZGEMM_L2x2_BEGIN: +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L2x2_END - mr BO, B + ble ZGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x2_SUB0 + ble ZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x2_SUB4 + ble ZGEMM_L2x2_SUB4 -.LZGEMM_L2x2_LOOP_START: +ZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +294,11 @@ KERNEL2x2_2 addic. L, L, -2 - ble .LZGEMM_L2x2_LOOP_END + ble ZGEMM_L2x2_LOOP_END .align 5 -.LZGEMM_L2x2_LOOP: +ZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +311,9 @@ KERNEL2x2_2 addic. L, L, -1 - bgt .LZGEMM_L2x2_LOOP + bgt ZGEMM_L2x2_LOOP -.LZGEMM_L2x2_LOOP_END: +ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +325,9 @@ KERNEL2x2_1 KERNEL2x2_E2 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB4: +ZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +339,48 @@ KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB0: +ZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x2_SAVE - b .LZGEMM_L2x2_SUB2 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SUB1: +ZGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x2_SAVE + ble ZGEMM_L2x2_SAVE -.LZGEMM_L2x2_SUB2: +ZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x2_SUB2 + bgt ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SAVE: +ZGEMM_L2x2_SAVE: SAVE2x2 -.LZGEMM_L2x2_END: +ZGEMM_L2x2_END: -.LZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L2x1_END - mr BO, B + ble ZGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x1_SUB0 + ble ZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x1_SUB4 + ble ZGEMM_L2x1_SUB4 -.LZGEMM_L2x1_LOOP_START: +ZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +394,11 @@ KERNEL2x1_2 addic. L, L, -2 - ble .LZGEMM_L2x1_LOOP_END + ble ZGEMM_L2x1_LOOP_END .align 5 -.LZGEMM_L2x1_LOOP: +ZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +411,9 @@ KERNEL2x1_2 addic. L, L, -1 - bgt .LZGEMM_L2x1_LOOP + bgt ZGEMM_L2x1_LOOP -.LZGEMM_L2x1_LOOP_END: +ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +425,9 @@ KERNEL2x1_1 KERNEL2x1_E2 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB4: +ZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +439,89 @@ KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB0: +ZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x1_SAVE - b .LZGEMM_L2x1_SUB2 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SUB1: +ZGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x1_SAVE + ble ZGEMM_L2x1_SAVE -.LZGEMM_L2x1_SUB2: +ZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x1_SUB2 + bgt ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SAVE: +ZGEMM_L2x1_SAVE: SAVE2x1 -.LZGEMM_L2x1_END: +ZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LZGEMM_L2_BEGIN + bgt ZGEMM_L2_BEGIN andi. 
T2, N, 1 - ble .L999 + ble L999 -.LZGEMM_L2_END: +ZGEMM_L2_END: - b .LZGEMM_L1_BEGIN + b ZGEMM_L1_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 + +ZGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +ZGEMM_L1_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L1_COPYB -.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LZGEMM_L1_END + ble ZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LZGEMM_L1x8_END + ble ZGEMM_L1x8_END -.LZGEMM_L1x8_BEGIN: +ZGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x8_SUB0 + ble ZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x8_SUB4 + ble ZGEMM_L1x8_SUB4 -.LZGEMM_L1x8_LOOP_START: +ZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +544,11 @@ KERNEL1x8_2 addic. L, L, -2 - ble .LZGEMM_L1x8_LOOP_END + ble ZGEMM_L1x8_LOOP_END .align 5 -.LZGEMM_L1x8_LOOP: +ZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +569,9 @@ KERNEL1x8_2 addic. L, L, -1 - bgt .LZGEMM_L1x8_LOOP + bgt ZGEMM_L1x8_LOOP -.LZGEMM_L1x8_LOOP_END: +ZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +590,9 @@ KERNEL1x8_1 KERNEL1x8_E2 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB4: +ZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +608,53 @@ KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB0: +ZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x8_SAVE - b .LZGEMM_L1x8_SUB2 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SUB1: +ZGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x8_SAVE + ble ZGEMM_L1x8_SAVE -.LZGEMM_L1x8_SUB2: +ZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x8_SUB2 + bgt ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SAVE: +ZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LZGEMM_L1x8_BEGIN + bgt ZGEMM_L1x8_BEGIN -.LZGEMM_L1x8_END: +ZGEMM_L1x8_END: -.LZGEMM_L1x4_BEGIN: +ZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L1x1_END + ble ZGEMM_L1x1_END andi. T1, M, 4 - ble .LZGEMM_L1x4_END - mr BO, B + ble ZGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x4_SUB0 + ble ZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x4_SUB4 + ble ZGEMM_L1x4_SUB4 -.LZGEMM_L1x4_LOOP_START: +ZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +668,11 @@ KERNEL1x4_2 addic. L, L, -2 - ble .LZGEMM_L1x4_LOOP_END + ble ZGEMM_L1x4_LOOP_END .align 5 -.LZGEMM_L1x4_LOOP: +ZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +685,9 @@ KERNEL1x4_2 addic. L, L, -1 - bgt .LZGEMM_L1x4_LOOP + bgt ZGEMM_L1x4_LOOP -.LZGEMM_L1x4_LOOP_END: +ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +699,9 @@ KERNEL1x4_1 KERNEL1x4_E2 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB4: +ZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +713,48 @@ KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB0: +ZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x4_SAVE - b .LZGEMM_L1x4_SUB2 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SUB1: +ZGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x4_SAVE + ble ZGEMM_L1x4_SAVE -.LZGEMM_L1x4_SUB2: +ZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x4_SUB2 + bgt ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SAVE: +ZGEMM_L1x4_SAVE: SAVE1x4 -.LZGEMM_L1x4_END: +ZGEMM_L1x4_END: -.LZGEMM_L1x2_BEGIN: +ZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L1x2_END - mr BO, B + ble ZGEMM_L1x2_END + mr BO, BBUFFER srawi. 
L, K, 3 - ble .LZGEMM_L1x2_SUB0 + ble ZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x2_SUB4 + ble ZGEMM_L1x2_SUB4 -.LZGEMM_L1x2_LOOP_START: +ZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +768,11 @@ KERNEL1x2_2 addic. L, L, -2 - ble .LZGEMM_L1x2_LOOP_END + ble ZGEMM_L1x2_LOOP_END .align 5 -.LZGEMM_L1x2_LOOP: +ZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +785,9 @@ KERNEL1x2_2 addic. L, L, -1 - bgt .LZGEMM_L1x2_LOOP + bgt ZGEMM_L1x2_LOOP -.LZGEMM_L1x2_LOOP_END: +ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +799,9 @@ KERNEL1x2_1 KERNEL1x2_E2 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB4: +ZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +813,48 @@ KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB0: +ZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x2_SAVE - b .LZGEMM_L1x2_SUB2 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SUB1: +ZGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x2_SAVE + ble ZGEMM_L1x2_SAVE -.LZGEMM_L1x2_SUB2: +ZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x2_SUB2 + bgt ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SAVE: +ZGEMM_L1x2_SAVE: SAVE1x2 -.LZGEMM_L1x2_END: +ZGEMM_L1x2_END: -.LZGEMM_L1x1_BEGIN: +ZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L1x1_END - mr BO, B + ble ZGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x1_SUB0 + ble ZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x1_SUB4 + ble ZGEMM_L1x1_SUB4 -.LZGEMM_L1x1_LOOP_START: +ZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +868,11 @@ KERNEL1x1_2 addic. L, L, -2 - ble .LZGEMM_L1x1_LOOP_END + ble ZGEMM_L1x1_LOOP_END .align 5 -.LZGEMM_L1x1_LOOP: +ZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +885,9 @@ KERNEL1x1_2 addic. L, L, -1 - bgt .LZGEMM_L1x1_LOOP + bgt ZGEMM_L1x1_LOOP -.LZGEMM_L1x1_LOOP_END: +ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +899,9 @@ KERNEL1x1_1 KERNEL1x1_E2 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB4: +ZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +913,34 @@ KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB0: +ZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x1_SAVE - b .LZGEMM_L1x1_SUB2 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SUB1: +ZGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x1_SAVE + ble ZGEMM_L1x1_SAVE -.LZGEMM_L1x1_SUB2: +ZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x1_SUB2 + bgt ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SAVE: +ZGEMM_L1x1_SAVE: SAVE1x1 -.LZGEMM_L1x1_END: +ZGEMM_L1x1_END: -.LZGEMM_L1_END: +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 701ec65c8..a0fbb2e11 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,39 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x8_1 + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B - xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - lxvd2x vs8, o0, AO // load real,imag from A - lxvd2x vs9, o16, AO // load real,imag from A - xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - - lxvd2x vs10, o32, AO // load real,imag from A - lxvd2x vs11, o48, AO // load real,imag from A - xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - - addi AO, AO, 64 - xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag @@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - - lxvd2x vs12, o0, AO // load real,imag from A - lxvd2x vs13, o16, AO // load real,imag from A - xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - - lxvd2x vs14, o32, AO // load real,imag from A - lxvd2x vs15, o48, AO // load real,imag from A - xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm .macro KERNEL2x8_2 + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - lxvd2x vs0, o0, AO // load real,imag from A - lxvd2x vs1, o16, AO // load real,imag from A - xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - - lxvd2x vs2, o32, AO // load real,imag from A - lxvd2x vs3, o48, AO // load real,imag from A - xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - addi AO, AO, 64 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - - lxvd2x vs4, o0, AO // load real,imag from A - lxvd2x vs5, o16, AO // load real,imag from A - xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - - lxvd2x vs6, o32, AO // load real,imag from A - lxvd2x vs7, o48, AO // load real,imag from A - xvmaddadp vs56, vs12, vs22 // real*real, imag*real xvmaddadp vs57, vs12, vs23 // real*imag, 
imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B - xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm @@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A @@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD1x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A @@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index 8b953765e..0cfe613d5 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#include "zgemm_macros_8x2_power8.S" +#include "ztrmm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble .L999 diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S new file mode 100644 index 000000000..701ec65c8 --- /dev/null +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -0,0 +1,3110 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
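+
+Summary of the arithmetic implemented by the macros below (the per-instruction
+comments are the precise reference; this is only a condensed sketch): for every
+complex element the KERNEL macros accumulate two vectors,
+
+    vs(2n)   : realA*realB , imagA*realB
+    vs(2n+1) : realA*imagB , imagA*imagB
+
+and each SAVE macro reduces such a pair with the conjugation-dependent
+XSFADD_R*/XSFADD_I* choices defined just after this header:
+
+    re = realA*realB (+/-) imagA*imagB
+    im = (+/-) realA*imagB (+/-) imagA*realB
+
+    C_real = alpha_r*re - alpha_i*im
+    C_imag = alpha_i*re + alpha_r*im
+
+added to the loaded C tile unless TRMMKERNEL is defined, in which case the
+result is stored directly.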
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // 
real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, 
vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // 
real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, 
vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp 
vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + 
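+// reduce accumulator pair vs46/vs47: XSFADD_R1/R2 collect the real part
+// (realA*realB +/- imagA*imagB) into vs0, XSFADD_I1/I2 the imaginary part
+// into vs1 (signs depend on the conjugation case selected at the top of this
+// file); the result is then scaled by alpha and merged with xxpermdi.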
XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // 
real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag 
part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, 
imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, 
alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, 
alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + 
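+// (as in the wider kernels: the *_I1/*_SUBI1 variants seed the accumulators
+// with xvmuldp, while *_1, *_2 and *_SUB1 accumulate via xvmaddadp)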
+.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, 
imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge 
real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, 
vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, 
vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, 
imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + 
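+	// the vs40/vs41 accumulator pair is reduced exactly like vs32..vs39 above:
+	// the XSFADD macros combine realA*realB with imagA*imagB and realA*imagB
+	// with imagA*realB, the result is scaled by (alpha_r, alpha_i) and merged
+	// into vs12 for the store below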
xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + 
stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x 
vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, 
realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp 
vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO 
// load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/param.h b/param.h index 2efd9b2c1..a6ead4b64 100644 --- a/param.h +++ b/param.h @@ -1980,7 +1980,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 720 -#define ZGEMM_DEFAULT_P 240 +#define ZGEMM_DEFAULT_P 480 #define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 @@ -1990,7 +1990,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_R 21600 #define DGEMM_DEFAULT_R 14400 #define CGEMM_DEFAULT_R 16200 -#define ZGEMM_DEFAULT_R 14400 +#define ZGEMM_DEFAULT_R 21600 #define SYMV_P 8 From 08bddde3f3abe3337a1a4177a6a9dbb2428fc87c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 8 Apr 2016 10:37:59 +0200 Subject: [PATCH 4/4] updated benchmark Makefile for ESSL --- benchmark/Makefile | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index badd42c6b..8166f3863 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread LIBVECLIB = -framework Accelerate ESSL=/opt/ibm/lib -LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a ifeq ($(OSNAME), WINNT) @@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX) slinpack.veclib : slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) @@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) @@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME)
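A note for reviewers of the complex kernel macros in the patch above: every KERNEL* macro accumulates, per complex element of A, one vector of real*real/imag*real products and one vector of real*imag/imag*imag products, and the SAVE* macros then reduce each accumulator pair, apply alpha and merge the real and imaginary halves before storing. The scalar C sketch below (not part of any patch in this series; the function name and argument layout are illustrative only) shows the arithmetic one such accumulate/save pair performs for a single element of C. It assumes the plain non-conjugated case, in which XSFADD_R2 subtracts imagA*imagB and XSFADD_I2 adds imagA*realB, and the non-TRMM path, which accumulates into C.

#include <stddef.h>

/* Scalar reference for one C element of the complex double-precision
 * kernel: C += alpha * sum_{i<k} A[i]*B[i], with A, B and C stored as
 * interleaved (real, imag) doubles.  Illustrative sketch only. */
static void zgemm_ref_1x1(size_t k,
                          const double *a, const double *b, double *c,
                          double alpha_r, double alpha_i)
{
    double rr = 0.0, ir = 0.0, ri = 0.0, ii = 0.0;

    for (size_t i = 0; i < k; i++) {
        double ar = a[2 * i], ai = a[2 * i + 1];
        double br = b[2 * i], bi = b[2 * i + 1];

        rr += ar * br;   /* real*real  (first  lane of vs32) */
        ir += ai * br;   /* imag*real  (second lane of vs32) */
        ri += ar * bi;   /* real*imag  (first  lane of vs33) */
        ii += ai * bi;   /* imag*imag  (second lane of vs33) */
    }

    double t_r = rr - ii;    /* XSFADD_R1 / XSFADD_R2 (non-conjugated) */
    double t_i = ri + ir;    /* XSFADD_I1 / XSFADD_I2 (non-conjugated) */

    /* alpha scaling and merge, as in the SAVE* macros */
    c[0] += t_r * alpha_r - t_i * alpha_i;
    c[1] += t_r * alpha_i + t_i * alpha_r;
}

In the vector code each xvmaddadp performs two of these scalar multiply-adds at once (the real and imaginary parts of one element of A), the wider macros repeat the same pattern across more elements of A and more columns of B, and when TRMMKERNEL is defined the loads of C and the final xvadddp are skipped, so the scaled result overwrites C instead of being accumulated into it.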