/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*	 LAPACK-TEST		: OK
**************************************************************************************/

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)

	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xssubdp
	#define	XSFADD_I1	xsadddp
	#define	XSFADD_I2	xsadddp

#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT)

	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xsadddp
	#define	XSFADD_I1	xssubdp
	#define	XSFADD_I2	xsadddp

#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR)

	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xsadddp
	#define	XSFADD_I1	xsadddp
	#define	XSFADD_I2	xssubdp

#else		// CC || CR || RC || RR

	#define	XSFADD_R1	xsadddp
	#define	XSFADD_R2	xssubdp
	#define	XSFADD_I1	xssubdp
	#define	XSFADD_I2	xssubdp

#endif
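
/* Editorial note, not in the original source: the XSFADD_* aliases above pick
 * add or subtract so the scalar reduction in the SAVE macros applies the right
 * conjugation signs for each transpose/conjugate variant. With a = ar + i*ai
 * from A and b = br + i*bi from B, the kernels accumulate the four products
 * ar*br, ai*bi, ar*bi and ai*br, and the save step folds them into real and
 * imag parts via XSFADD_R1/R2 and XSFADD_I1/I2. For example, the NN/NT/TN/TT
 * case (all adds except XSFADD_R2) yields
 *
 *     real = ar*br - ai*bi
 *     imag = ar*bi + ai*br
 *
 * i.e. the ordinary complex product a*b; the other branches flip signs to
 * realize the conjugated variants.
 */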

/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64


#if defined(_AIX)
')
#else
.endm
#endif
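
/* Editorial note, not in the original source: each lxvd2x pulls one 16-byte
 * vector. The packed A panel stores each complex element as an interleaved
 * (real, imag) pair in a single vector, while the packed B panel is assumed to
 * hold the real part and the imag part of each element duplicated across both
 * lanes of separate vectors (hence 64 bytes of B per two complex values).
 * That duplication is what lets the kernels below form all four cross
 * products with plain full-vector multiplies.
 */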

#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmuldp		vs48,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs50,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs52,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs54,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmuldp		vs56,	vs4,	vs18		// real*real, imag*real
	xvmuldp		vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmuldp		vs58,	vs5,	vs18		// real*real, imag*real
	xvmuldp		vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmuldp		vs60,	vs6,	vs18		// real*real, imag*real
	xvmuldp		vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmuldp		vs62,	vs7,	vs18		// real*real, imag*real
	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
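
/* Editorial sketch, not in the original source: each accumulator pair follows
 * the same pattern, e.g. vs32 = (ar*br, ai*br) and vs33 = (ar*bi, ai*bi) for
 * one C element. Assuming a hypothetical helper type, the per-iteration update
 * for one element corresponds roughly to this C:
 *
 *     typedef struct { double r, i; } zdouble;    // hypothetical helper type
 *
 *     static void zfma(double acc[4], zdouble a, zdouble b)
 *     {
 *         acc[0] += a.r * b.r;   // realA*realB   (even vs, lane 0)
 *         acc[1] += a.i * b.r;   // imagA*realB   (even vs, lane 1)
 *         acc[2] += a.r * b.i;   // realA*imagB   (odd vs, lane 0)
 *         acc[3] += a.i * b.i;   // imagA*imagB   (odd vs, lane 1)
 *     }
 *
 * One xvmuldp/xvmaddadp performs two of these multiply-adds at once because
 * the B factor is pre-duplicated across both vector lanes; the SAVE macros
 * later fold the four partial sums into real and imag parts using the
 * XSFADD_* sign choices.
 */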

#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmaddadp	vs48,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs50,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs52,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs54,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmaddadp	vs56,	vs4,	vs18		// real*real, imag*real
	xvmaddadp	vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmaddadp	vs58,	vs5,	vs18		// real*real, imag*real
	xvmaddadp	vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmaddadp	vs60,	vs6,	vs18		// real*real, imag*real
	xvmaddadp	vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmaddadp	vs62,	vs7,	vs18		// real*real, imag*real
	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

	xvmaddadp	vs48,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs49,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs50,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs51,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs52,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs53,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs54,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs55,	vs11,	vs23		// real*imag, imag*imag
	xvmaddadp	vs56,	vs12,	vs22		// real*real, imag*real
	xvmaddadp	vs57,	vs12,	vs23		// real*imag, imag*imag
	xvmaddadp	vs58,	vs13,	vs22		// real*real, imag*real
	xvmaddadp	vs59,	vs13,	vs23		// real*imag, imag*imag
	xvmaddadp	vs60,	vs14,	vs22		// real*real, imag*real
	xvmaddadp	vs61,	vs14,	vs23		// real*imag, imag*imag
	xvmaddadp	vs62,	vs15,	vs22		// real*real, imag*real
	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
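
/* Editorial note, not in the original source: KERNEL2x8_1 and KERNEL2x8_2 are
 * the two halves of a software-pipelined loop. Each half multiplies the data
 * loaded by the other half (vs0-vs7/vs16-vs19 versus vs8-vs15/vs20-vs23) while
 * prefetching its own, so the loads and FMAs of consecutive iterations can
 * overlap; KERNEL2x8_E2 below drains the pipeline by consuming the last
 * prefetched registers without loading more.
 */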

#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

	xvmaddadp	vs48,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs49,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs50,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs51,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs52,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs53,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs54,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs55,	vs11,	vs23		// real*imag, imag*imag
	xvmaddadp	vs56,	vs12,	vs22		// real*real, imag*real
	xvmaddadp	vs57,	vs12,	vs23		// real*imag, imag*imag
	xvmaddadp	vs58,	vs13,	vs22		// real*real, imag*real
	xvmaddadp	vs59,	vs13,	vs23		// real*imag, imag*imag
	xvmaddadp	vs60,	vs14,	vs22		// real*real, imag*real
	xvmaddadp	vs61,	vs14,	vs23		// real*imag, imag*imag
	xvmaddadp	vs62,	vs15,	vs22		// real*real, imag*real
	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmuldp		vs48,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs50,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs52,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs54,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmuldp		vs56,	vs4,	vs18		// real*real, imag*real
	xvmuldp		vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmuldp		vs58,	vs5,	vs18		// real*real, imag*real
	xvmuldp		vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmuldp		vs60,	vs6,	vs18		// real*real, imag*real
	xvmuldp		vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmuldp		vs62,	vs7,	vs18		// real*real, imag*real
	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

	xvmaddadp	vs48,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs49,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs50,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs51,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs52,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs53,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs54,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs55,	vs3,	vs19		// real*imag, imag*imag
	xvmaddadp	vs56,	vs4,	vs18		// real*real, imag*real
	xvmaddadp	vs57,	vs4,	vs19		// real*imag, imag*imag
	xvmaddadp	vs58,	vs5,	vs18		// real*real, imag*real
	xvmaddadp	vs59,	vs5,	vs19		// real*imag, imag*imag
	xvmaddadp	vs60,	vs6,	vs18		// real*real, imag*real
	xvmaddadp	vs61,	vs6,	vs19		// real*imag, imag*imag
	xvmaddadp	vs62,	vs7,	vs18		// real*real, imag*real
	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x8', `
#else
.macro SAVE2x8
#endif


	mr		T1,	CO
	addi		T2,	T1,	64

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1
	lxvd2x		vs20,	o0,	T2
	lxvd2x		vs21,	o16,	T2
	lxvd2x		vs22,	o32,	T2
	lxvd2x		vs23,	o48,	T2

#endif


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB

	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB

	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB

	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB

	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19
	xvadddp		vs12,	vs12,	vs20
	xvadddp		vs13,	vs13,	vs21
	xvadddp		vs14,	vs14,	vs22
	xvadddp		vs15,	vs15,	vs23

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1
	stxvd2x		vs12,	o0,	T2
	stxvd2x		vs13,	o16,	T2
	stxvd2x		vs14,	o32,	T2
	stxvd2x		vs15,	o48,	T2

	add		T1,	T1,	LDC
	add		T2,	T2,	LDC

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1
	lxvd2x		vs20,	o0,	T2
	lxvd2x		vs21,	o16,	T2
	lxvd2x		vs22,	o32,	T2
	lxvd2x		vs23,	o48,	T2

#endif


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs49,vs49)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs48		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs49		// imagA*imagB

	XXSWAPD(vs48,vs48)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs49,vs49)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs48		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs49		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs51,vs51)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs50		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs51		// imagA*imagB

	XXSWAPD(vs50,vs50)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs51,vs51)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs50		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs51		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs53,vs53)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs52		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs53		// imagA*imagB

	XXSWAPD(vs52,vs52)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs53,vs53)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs52		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs53		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs55,vs55)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs54		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs55		// imagA*imagB

	XXSWAPD(vs54,vs54)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs55,vs55)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs54		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs55		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs57,vs57)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs56		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs57		// imagA*imagB

	XXSWAPD(vs56,vs56)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs57,vs57)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs56		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs57		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs59,vs59)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs58		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs59		// imagA*imagB

	XXSWAPD(vs58,vs58)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs59,vs59)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs58		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs59		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs61,vs61)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs60		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs61		// imagA*imagB

	XXSWAPD(vs60,vs60)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs61,vs61)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs60		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs61		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part



	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs63,vs63)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs62		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs63		// imagA*imagB

	XXSWAPD(vs62,vs62)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs63,vs63)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs62		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs63		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r

	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19
	xvadddp		vs12,	vs12,	vs20
	xvadddp		vs13,	vs13,	vs21
	xvadddp		vs14,	vs14,	vs22
	xvadddp		vs15,	vs15,	vs23

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1
	stxvd2x		vs12,	o0,	T2
	stxvd2x		vs13,	o16,	T2
	stxvd2x		vs14,	o32,	T2
	stxvd2x		vs15,	o48,	T2

	add		T1,	T1,	LDC
	add		T2,	T2,	LDC
	addi		CO,	CO,	128

#if defined(_AIX)
')
#else
.endm
#endif
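
/* Editorial sketch, not in the original source: each reduction block in the
 * SAVE macros collapses one accumulator pair into a stored complex value.
 * Assuming the non-conjugated (NN-style) sign choices and the hypothetical
 * zdouble helper from the earlier sketch, finishing one C element corresponds
 * roughly to this C:
 *
 *     // acc = {ar*br, ai*br, ar*bi, ai*bi}; c is the existing C element
 *     static zdouble zsave(const double acc[4], zdouble c,
 *                          double alpha_r, double alpha_i)
 *     {
 *         double re = acc[0] - acc[3];        // XSFADD_R1 / XSFADD_R2
 *         double im = acc[2] + acc[1];        // XSFADD_I1 / XSFADD_I2
 *         zdouble out;
 *         out.r = c.r + re * alpha_r - im * alpha_i;
 *         out.i = c.i + re * alpha_i + im * alpha_r;
 *         return out;                         // stxvd2x of the merged pair
 *     }
 *
 * The TRMM build (TRMMKERNEL defined) skips the load of C and the xvadddp, so
 * C is overwritten rather than accumulated into.
 */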


/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x4_1', `
#else
.macro LOAD2x4_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_I1', `
#else
.macro KERNEL2x4_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag

	xvmuldp		vs40,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs42,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs44,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs46,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_1', `
#else
.macro KERNEL2x4_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag

	xvmaddadp	vs40,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs42,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs44,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs46,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_2', `
#else
.macro KERNEL2x4_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag

	xvmaddadp	vs40,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs41,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs42,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs43,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs44,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs45,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs46,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_E2', `
#else
.macro KERNEL2x4_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag

	xvmaddadp	vs40,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs41,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs42,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs43,	vs9,	vs23		// real*imag, imag*imag
	xvmaddadp	vs44,	vs10,	vs22		// real*real, imag*real
	xvmaddadp	vs45,	vs10,	vs23		// real*imag, imag*imag
	xvmaddadp	vs46,	vs11,	vs22		// real*real, imag*real
	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_SUBI1', `
#else
.macro KERNEL2x4_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag

	xvmuldp		vs40,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs42,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmuldp		vs44,	vs2,	vs18		// real*real, imag*real
	xvmuldp		vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmuldp		vs46,	vs3,	vs18		// real*real, imag*real
	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x4_SUB1', `
#else
.macro KERNEL2x4_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag

	xvmaddadp	vs40,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs41,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs42,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs43,	vs1,	vs19		// real*imag, imag*imag
	xvmaddadp	vs44,	vs2,	vs18		// real*real, imag*real
	xvmaddadp	vs45,	vs2,	vs19		// real*imag, imag*imag
	xvmaddadp	vs46,	vs3,	vs18		// real*real, imag*real
	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x4', `
#else
.macro SAVE2x4
#endif

	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB

	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB

	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB

	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB

	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	64

#if defined(_AIX)
')
#else
.endm
#endif
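
/* Written out, each reduction block in SAVE2x4 (and the other SAVE
 * macros) computes, with signs chosen by the XSFADD_R2/XSFADD_I1/
 * XSFADD_I2 variants for the conjugation case:
 *   re   = realA*realB -/+ imagA*imagB
 *   im   = realA*imagB +/- imagA*realB
 *   C_re = re*alpha_r  -  im*alpha_i
 *   C_im = re*alpha_i  +  im*alpha_r
 * i.e. a complex multiply by alpha, followed (outside of TRMM builds)
 * by an accumulate into the C values loaded earlier.
 */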

/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
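
/* The 2x2 tile uses the same flow as the wider tiles: LOAD2x2_1 primes
 * vs0-vs1 and vs16-vs19, the KERNEL2x2_* variants below perform the
 * multiply/accumulate steps (see the pipelining note after
 * KERNEL2x2_E2), and SAVE2x2 reduces the accumulators and stores the
 * 2x2 block of C.
 */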

#if defined(_AIX)
define(`LOAD2x2_1', `
#else
.macro LOAD2x2_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_I1', `
#else
.macro KERNEL2x2_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmuldp		vs36,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs38,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_1', `
#else
.macro KERNEL2x2_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmaddadp	vs36,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs38,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_2', `
#else
.macro KERNEL2x2_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag

	xvmaddadp	vs36,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs37,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs38,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_E2', `
#else
.macro KERNEL2x2_E2
#endif

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag

	xvmaddadp	vs36,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs37,	vs8,	vs23		// real*imag, imag*imag
	xvmaddadp	vs38,	vs9,	vs22		// real*real, imag*real
	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif
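
/* The _I1/_1/_2/_E2 variants form a two-stage software pipeline: _I1
 * initializes the accumulators with xvmuldp while prefetching the next
 * operands into vs8-vs9 and vs20-vs23, _1 and _2 alternate between the
 * two register sets so that the loads of one iteration overlap the
 * multiplies of the other, and _E2 drains the last prefetched operands
 * without issuing further loads. The _SUBI1/_SUB1 macros below are the
 * unpipelined single-step equivalents.
 */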

#if defined(_AIX)
define(`KERNEL2x2_SUBI1', `
#else
.macro KERNEL2x2_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmuldp		vs36,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmuldp		vs38,	vs1,	vs18		// real*real, imag*real
	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x2_SUB1', `
#else
.macro KERNEL2x2_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag

	xvmaddadp	vs36,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs37,	vs0,	vs19		// real*imag, imag*imag
	xvmaddadp	vs38,	vs1,	vs18		// real*real, imag*real
	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif

	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	32

#if defined(_AIX)
')
#else
.endm
#endif

/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD2x1_1', `
#else
.macro LOAD2x1_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_I1', `
#else
.macro KERNEL2x1_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmuldp		vs34,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_1', `
#else
.macro KERNEL2x1_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B
	lxvd2x		vs22,	o32,	BO		// load real part from B
	lxvd2x		vs23,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmaddadp	vs34,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_2', `
#else
.macro KERNEL2x1_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag

	xvmaddadp	vs34,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_E2', `
#else
.macro KERNEL2x1_E2
#endif

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag

	xvmaddadp	vs34,	vs8,	vs22		// real*real, imag*real
	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_SUBI1', `
#else
.macro KERNEL2x1_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmuldp		vs34,	vs0,	vs18		// real*real, imag*real
	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL2x1_SUB1', `
#else
.macro KERNEL2x1_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B
	lxvd2x		vs18,	o32,	BO		// load real part from B
	lxvd2x		vs19,	o48,	BO		// load imag part from B

	addi		BO,	BO,	64

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag

	xvmaddadp	vs34,	vs0,	vs18		// real*real, imag*real
	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE2x1', `
#else
.macro SAVE2x1
#endif

	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part

#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16

#endif

	stxvd2x		vs8,	o0,	T1

	add		T1,	T1,	LDC

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part

#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16

#endif

	stxvd2x		vs8,	o0,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	16

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
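
/* For the N=1 tiles only one column of B is live, so each load step
 * fetches 32 bytes of B (one duplicated real part in vs16/vs20, one
 * duplicated imag part in vs17/vs21) instead of the 64 bytes used by
 * the N=2 macros above.
 */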

#if defined(_AIX)
define(`LOAD1x8_1', `
#else
.macro LOAD1x8_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_I1', `
#else
.macro KERNEL1x8_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_1', `
#else
.macro KERNEL1x8_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs12,	o0,	AO		// load real,imag from A
	lxvd2x		vs13,	o16,	AO		// load real,imag from A
	lxvd2x		vs14,	o32,	AO		// load real,imag from A
	lxvd2x		vs15,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_2', `
#else
.macro KERNEL1x8_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_E2', `
#else
.macro KERNEL1x8_E2
#endif

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
	xvmaddadp	vs40,	vs12,	vs20		// real*real, imag*real
	xvmaddadp	vs41,	vs12,	vs21		// real*imag, imag*imag
	xvmaddadp	vs42,	vs13,	vs20		// real*real, imag*real
	xvmaddadp	vs43,	vs13,	vs21		// real*imag, imag*imag
	xvmaddadp	vs44,	vs14,	vs20		// real*real, imag*real
	xvmaddadp	vs45,	vs14,	vs21		// real*imag, imag*imag
	xvmaddadp	vs46,	vs15,	vs20		// real*real, imag*real
	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_SUBI1', `
#else
.macro KERNEL1x8_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmuldp		vs40,	vs4,	vs16		// real*real, imag*real
	xvmuldp		vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmuldp		vs42,	vs5,	vs16		// real*real, imag*real
	xvmuldp		vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmuldp		vs44,	vs6,	vs16		// real*real, imag*real
	xvmuldp		vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmuldp		vs46,	vs7,	vs16		// real*real, imag*real
	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x8_SUB1', `
#else
.macro KERNEL1x8_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs4,	o0,	AO		// load real,imag from A
	lxvd2x		vs5,	o16,	AO		// load real,imag from A
	lxvd2x		vs6,	o32,	AO		// load real,imag from A
	lxvd2x		vs7,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
	xvmaddadp	vs40,	vs4,	vs16		// real*real, imag*real
	xvmaddadp	vs41,	vs4,	vs17		// real*imag, imag*imag
	xvmaddadp	vs42,	vs5,	vs16		// real*real, imag*real
	xvmaddadp	vs43,	vs5,	vs17		// real*imag, imag*imag
	xvmaddadp	vs44,	vs6,	vs16		// real*real, imag*real
	xvmaddadp	vs45,	vs6,	vs17		// real*imag, imag*imag
	xvmaddadp	vs46,	vs7,	vs16		// real*real, imag*real
	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag

#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x8', `
#else
.macro SAVE1x8
#endif

	mr		T1,	CO
	addi		T2,	T1,	64

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1
	lxvd2x		vs20,	o0,	T2
	lxvd2x		vs21,	o16,	T2
	lxvd2x		vs22,	o32,	T2
	lxvd2x		vs23,	o48,	T2

#endif

	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB

	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs12,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB

	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs13,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB

	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs14,	vs2,	vs3,	0	// merge real and imag part


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB

	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs15,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19
	xvadddp		vs12,	vs12,	vs20
	xvadddp		vs13,	vs13,	vs21
	xvadddp		vs14,	vs14,	vs22
	xvadddp		vs15,	vs15,	vs23

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1
	stxvd2x		vs12,	o0,	T2
	stxvd2x		vs13,	o16,	T2
	stxvd2x		vs14,	o32,	T2
	stxvd2x		vs15,	o48,	T2

	add		T1,	T1,	LDC
	add		T2,	T2,	LDC
	addi		CO,	CO,	128

#if defined(_AIX)
')
#else
.endm
#endif
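
/* SAVE1x8 covers a 128-byte row of C, beyond the reach of the
 * o0/o16/o32/o48 offset registers from a single base, so it keeps a
 * second pointer T2 at CO+64 and issues the upper four loads and
 * stores relative to it.
 */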
 | 
						|
 | 
						|
 | 
						|
/**********************************************************************************************
 | 
						|
* Macros for N=1 and M=4
 | 
						|
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD1x4_1', `
#else
.macro LOAD1x4_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_I1', `
#else
.macro KERNEL1x4_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_1', `
#else
.macro KERNEL1x4_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A
	lxvd2x		vs10,	o32,	AO		// load real,imag from A
	lxvd2x		vs11,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_2', `
#else
.macro KERNEL1x4_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_E2', `
#else
.macro KERNEL1x4_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
	xvmaddadp	vs36,	vs10,	vs20		// real*real, imag*real
	xvmaddadp	vs37,	vs10,	vs21		// real*imag, imag*imag
	xvmaddadp	vs38,	vs11,	vs20		// real*real, imag*real
	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_SUBI1', `
#else
.macro KERNEL1x4_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmuldp		vs36,	vs2,	vs16		// real*real, imag*real
	xvmuldp		vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmuldp		vs38,	vs3,	vs16		// real*real, imag*real
	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x4_SUB1', `
#else
.macro KERNEL1x4_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A
	lxvd2x		vs2,	o32,	AO		// load real,imag from A
	lxvd2x		vs3,	o48,	AO		// load real,imag from A

	addi		AO,	AO,	64

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
	xvmaddadp	vs36,	vs2,	vs16		// real*real, imag*real
	xvmaddadp	vs37,	vs2,	vs17		// real*imag, imag*imag
	xvmaddadp	vs38,	vs3,	vs16		// real*real, imag*real
	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif
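
/* A sketch of the intended calling pattern for these macros (an assumption
 * inferred from the register double-buffering, not copied from the actual
 * driver loop, which lives elsewhere):
 *
 *   LOAD1x4_1             // preload vs0-vs3 / vs16-vs17
 *   KERNEL1x4_I1          // first K step: xvmuldp initializes vs32-vs39
 *   KERNEL1x4_2           // even step: consume vs8-vs11, reload vs0-vs3
 *   KERNEL1x4_1           // odd step: consume vs0-vs3, reload vs8-vs11
 *   ...                   // repeat _2/_1 across the unrolled K loop
 *   KERNEL1x4_E2          // final step: accumulate only, no loads
 *   KERNEL1x4_SUBI1/_SUB1 // or: handle K%unroll leftovers one step at a time
 *   SAVE1x4               // reduce, scale by alpha, write C
 */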

#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif


	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1
	lxvd2x		vs18,	o32,	T1
	lxvd2x		vs19,	o48,	T1

#endif


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part




	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part




	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB

	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs10,	vs2,	vs3,	0	// merge real and imag part




	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB

	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs11,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17
	xvadddp		vs10,	vs10,	vs18
	xvadddp		vs11,	vs11,	vs19

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1
	stxvd2x		vs10,	o32,	T1
	stxvd2x		vs11,	o48,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	64

#if defined(_AIX)
')
#else
.endm
#endif
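
/* SAVE1x4 is the four-column analogue of the wider save macros: the same
 * reduce-and-scale sequence is unrolled once per accumulator pair
 * (vs32/vs33 ... vs38/vs39 -> vs8 ... vs11), and CO advances by 64 bytes
 * (4 complex doubles) instead of 128.
 */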


/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A
	lxvd2x		vs9,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
	xvmaddadp	vs34,	vs9,	vs20		// real*real, imag*real
	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmuldp		vs34,	vs1,	vs16		// real*real, imag*real
	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A
	lxvd2x		vs1,	o16,	AO		// load real,imag from A

	addi		AO,	AO,	32

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
	xvmaddadp	vs34,	vs1,	vs16		// real*real, imag*real
	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif


	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1
	lxvd2x		vs17,	o16,	T1

#endif


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part




	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB

	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs9,	vs2,	vs3,	0	// merge real and imag part



#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16
	xvadddp		vs9,	vs9,	vs17

#endif

	stxvd2x		vs8,	o0,	T1
	stxvd2x		vs9,	o16,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	32

#if defined(_AIX)
')
#else
.endm
#endif


/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
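
/* M=1 works on a single complex double per K step: one 16-byte A load and one
 * accumulator pair (vs32/vs33). The pipelining scheme and the save sequence are
 * otherwise the same as in the wider macros above.
 */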

#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif

	lxvd2x		vs8,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs20,	o0,	BO		// load real part from B
	lxvd2x		vs21,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif


	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmuldp		vs32,	vs0,	vs16		// real*real, imag*real
	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif

	lxvd2x		vs0,	o0,	AO		// load real,imag from A

	addi		AO,	AO,	16

	lxvd2x		vs16,	o0,	BO		// load real part from B
	lxvd2x		vs17,	o16,	BO		// load imag part from B

	addi		BO,	BO,	32

	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag


#if defined(_AIX)
')
#else
.endm
#endif

#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif


	mr		T1,	CO

#ifndef TRMMKERNEL

	lxvd2x		vs16,	o0,	T1

#endif


	xxlxor		vs0,	vs0,	vs0
	xxlxor		vs1,	vs1,	vs1
	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB

	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB

	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB

	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB

	xsmuldp		vs4,	vs0,	alpha_r		// real*alpha_r
	xsmuldp		vs5,	vs1,	alpha_i		// imag*alpha_i
	xsmuldp		vs6,	vs0,	alpha_i		// real*alpha_i
	xsmuldp		vs7,	vs1,	alpha_r		// imag*alpha_r

	xssubdp		vs2,	vs4,	vs5		// real*alpha_r - imag*alpha_i
	xsadddp		vs3,	vs6,	vs7		// real*alpha_i + imag*alpha_r
	xxpermdi	vs8,	vs2,	vs3,	0	// merge real and imag part


#ifndef TRMMKERNEL

	xvadddp		vs8,	vs8,	vs16

#endif

	stxvd2x		vs8,	o0,	T1

	add		T1,	T1,	LDC
	addi		CO,	CO,	16

#if defined(_AIX)
')
#else
.endm
#endif
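
/* The ZCOPYB macros below are the B-panel copy helpers: they expand each complex
 * element of B into two full VSX registers in the BBO buffer (real part
 * broadcast into the first, imag part broadcast into the second), which is
 * exactly the layout the kernels above expect when they load the "real part"
 * and "imag part" vectors from B at offsets o0 and o16.
 */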


#if defined(_AIX)
define(`ZCOPYB_1x1', `
#else
.macro ZCOPYB_1x1
#endif

        lxvdsx          vs4,    o0,     BO              // b0_r
        lxvdsx          vs5,    o8,     BO              // b0_i
        addi            BO,     BO,     16
        stxvd2x         vs4,    o0,     BBO
        stxvd2x         vs5,    o16,    BBO
        addi            BBO,    BBO,    32

#if defined(_AIX)
')
#else
.endm
#endif
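
/* ZCOPYB_8x1 performs the same expansion for eight B elements at a time: each
 * lxvd2x brings in one complex value, and the XXSPLTD pairs duplicate
 * doubleword 0 and doubleword 1 into separate registers before the stores.
 * Which index is the real half and which the imag half depends on the
 * endianness handling inside the XXSPLTD definition elsewhere in the build,
 * so treat the 0/1 ordering described here as an assumption.
 */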

#if defined(_AIX)
define(`ZCOPYB_8x1', `
#else
.macro ZCOPYB_8x1
#endif

        lxvd2x          vs32,   o0,     BO
        lxvd2x          vs33,  o16,     BO
        lxvd2x          vs34,  o32,     BO
        lxvd2x          vs35,  o48,     BO
        addi            BO,     BO,     64

        lxvd2x          vs36,   o0,     BO
        lxvd2x          vs37,  o16,     BO
        lxvd2x          vs38,  o32,     BO
        lxvd2x          vs39,  o48,     BO
        addi            BO,     BO,     64

	XXSPLTD(vs40,vs32,0)
	XXSPLTD(vs41,vs32,1)
	XXSPLTD(vs42,vs33,0)
	XXSPLTD(vs43,vs33,1)
	XXSPLTD(vs44,vs34,0)
	XXSPLTD(vs45,vs34,1)
	XXSPLTD(vs46,vs35,0)
	XXSPLTD(vs47,vs35,1)

	XXSPLTD(vs48,vs36,0)
	XXSPLTD(vs49,vs36,1)
	XXSPLTD(vs50,vs37,0)
	XXSPLTD(vs51,vs37,1)
	XXSPLTD(vs52,vs38,0)
	XXSPLTD(vs53,vs38,1)
	XXSPLTD(vs54,vs39,0)
	XXSPLTD(vs55,vs39,1)

        stxvd2x         vs40,    o0,     BBO
        stxvd2x         vs41,   o16,     BBO
        stxvd2x         vs42,   o32,     BBO
        stxvd2x         vs43,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs44,    o0,     BBO
        stxvd2x         vs45,   o16,     BBO
        stxvd2x         vs46,   o32,     BBO
        stxvd2x         vs47,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs48,    o0,     BBO
        stxvd2x         vs49,   o16,     BBO
        stxvd2x         vs50,   o32,     BBO
        stxvd2x         vs51,   o48,     BBO
        addi            BBO,    BBO,    64

        stxvd2x         vs52,    o0,     BBO
        stxvd2x         vs53,   o16,     BBO
        stxvd2x         vs54,   o32,     BBO
        stxvd2x         vs55,   o48,     BBO
        addi            BBO,    BBO,    64

#if defined(_AIX)
')
#else
.endm
#endif