1825 lines
		
	
	
		
			47 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			1825 lines
		
	
	
		
			47 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
/***************************************************************************
 | 
						|
Copyright (c) 2013-2019, The OpenBLAS Project
 | 
						|
All rights reserved.
 | 
						|
Redistribution and use in source and binary forms, with or without
 | 
						|
modification, are permitted provided that the following conditions are
 | 
						|
met:
 | 
						|
1. Redistributions of source code must retain the above copyright
 | 
						|
notice, this list of conditions and the following disclaimer.
 | 
						|
2. Redistributions in binary form must reproduce the above copyright
 | 
						|
notice, this list of conditions and the following disclaimer in
 | 
						|
the documentation and/or other materials provided with the
 | 
						|
distribution.
 | 
						|
3. Neither the name of the OpenBLAS project nor the names of
 | 
						|
its contributors may be used to endorse or promote products
 | 
						|
derived from this software without specific prior written permission.
 | 
						|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 | 
						|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 | 
						|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 | 
						|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
 | 
						|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 | 
						|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 | 
						|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 | 
						|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 | 
						|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 | 
						|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
						|
*****************************************************************************/
 | 
						|
 | 
						|
/* One element = one double-precision complex number (2 x 8 bytes). */
#define unit_size 16
/* DISPn(ind,disp): byte displacement for unrolled-loop iteration `ind`
 * when the pointer advances by n elements per iteration, plus a fixed
 * extra offset `disp`.  Used as the D-field of lxv/stxv/addi below. */
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
/* DISPX: plain displacement, no per-iteration scaling. */
#define DISPX(disp)  (disp)
 | 
						|
/*	HELPERS FOR SAVE	*/
 | 
						|
/* {r0,i0} and {r1,i1} into  {r0,r1} {i0,i1} */
 | 
						|
 | 
						|
 | 
						|
/* Load two adjacent complex values {r0,i0},{r1,i1} from \REG + \LOFFSET
 * and repack them as {r0,r1} (\VS_OUT1) and {i0,i1} (\VS_OUT2), matching
 * the real/real, imag/imag layout the accumulators use.
 * \VS_TEMP1/\VS_TEMP2 are scratch.  Compiled out for TRMM kernels, which
 * never read C before writing it. */
.macro LOAD_COUPLE_AS_RR_II  VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET 
#ifndef TRMMKERNEL 
  lxv	\VS_TEMP1,	DISPX(\LOFFSET)(\REG)
  lxv	\VS_TEMP2,	DISPX(\LOFFSET+16)(\REG)
  xxmrgld  \VS_OUT1,\VS_TEMP1,\VS_TEMP2	/* merge low  doublewords -> {r0,r1} */
  xxmrghd  \VS_OUT2,\VS_TEMP1,\VS_TEMP2	/* merge high doublewords -> {i0,i1} */
#endif	
.endm
 | 
						|
/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
 | 
						|
 | 
						|
 | 
						|
/* From two accumulated results {a0r*br,a0i*bi} and {a1r*br,a1i*bi},
 * pack {a0r*br,a1r*br} into \VSOUT1 and {a0i*bi,a1i*bi} into \VSOUT2. */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
	xxmrgld	\VSOUT1, \VSIN1,\VSIN2 /*  real*real from 2 results*/
	xxmrghd	\VSOUT2, \VSIN1,\VSIN2 /*  imag*imag from 2 results*/
.endm 

/* From two accumulated results {a0r*bi,a0i*br} and {a1r*bi,a1i*br},
 * pack {a0r*bi,a1r*bi} into \VSOUT1 and {a0i*br,a1i*br} into \VSOUT2. */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 
	xxmrgld	\VSOUT1, \VSIN1,\VSIN2 /*  real*imag */
	xxmrghd	\VSOUT2, \VSIN1,\VSIN2 /*  imag*real*/
.endm
 | 
						|
/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
 | 
						|
 | 
						|
 | 
						|
/* Combine the packed products into complex results, with the add/sub
 * pattern chosen per kernel conjugation variant (the two-letter macros
 * name the transpose/conjugate state of A and B):
 *   real part: {a_r*b_r op a_i*b_i} -> \VSINR_OUT1 (in place)
 *   imag part: {a_r*b_i op a_i*b_r} -> \VSINI_OUT2 (in place)
 * yielding {r0,r1} and {i0,i1} for two adjacent output elements. */
.macro  AGGREGATE_REALS_IMAGES  VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if	defined(NN) || defined(NT) || defined(TN) || defined(TT) 
	/* no conjugation: r = rr - ii, i = ri + ir */
	xvsubdp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvadddp  \VSINI_OUT2,\VSINI_OUT2,\VSINI  
#elif  defined(CN) || defined(CT) || defined(RN) || defined(RT) 
	/* conj(A): r = rr + ii, i = ri - ir */
	xvadddp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvsubdp  \VSINI_OUT2,\VSINI_OUT2,\VSINI 
#elif  defined(NC) || defined(TC) || defined(NR) || defined(TR) 
	/* conj(B): r = rr + ii, i = ir - ri */
	xvadddp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvsubdp  \VSINI_OUT2,\VSINI,\VSINI_OUT2  
#else	// CC || CR || RC || RR 
    /*we will assume {-alpha_r,-alpha_i} for this case */
    /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
	xvsubdp  \VSINR_OUT1,\VSINR,\VSINR_OUT1
    /*we will negate alpha image instead  instead to fix sign*/
	xvadddp  \VSINI_OUT2,\VSINI_OUT2,\VSINI 
#endif
.endm 
 | 
						|
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
 | 
						|
 | 
						|
 | 
						|
/* Alpha scaling, step 1 of 2 ("APLHA" spelling is the established macro
 * name used by callers — kept as-is).
 * Non-TRMM (C was preloaded into \VSOUT1/\VSOUT2 as {r,r}/{i,i}):
 *   \VSOUT1 = {i0,i1}*alpha_i - \VSOUT1      (msub: a*b - acc)
 *   \VSOUT2 = \VSOUT2 + {r0,r1}*alpha_i      (madd: acc + a*b)
 * TRMM (no C read): plain multiplies by alpha_i. */
.macro MULT_APLHA_PART1  VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL  
	xvmsubadp \VSOUT1,\VSINII, alpha_i
	xvmaddadp  \VSOUT2,\VSINRR, alpha_i
#else 
	xvmuldp \VSOUT1,\VSINII, alpha_i 
	xvmuldp  \VSOUT2,\VSINRR, alpha_i
#endif 
.endm

/* Alpha scaling, step 2 of 2:
 *   \VSOUT1 = {r0,r1}*alpha_r - \VSOUT1   -> final real parts
 *   \VSOUT2 = \VSOUT2 + {i0,i1}*alpha_r   -> final imag parts */
.macro MULT_APLHA_PART2  VSINRR,VSINII,VSOUT1,VSOUT2 
	xvmsubadp  \VSOUT1,\VSINRR, alpha_r
	xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
 | 
						|
/* unpack to store 2{r,r} {i,i} into  {r,i} {r,i} (big endian because of stxv) */
 | 
						|
 | 
						|
 | 
						|
/* Inverse of the load repacking: turn {r0,r1} (\VSIN1) and {i0,i1}
 * (\VSIN2) back into two interleaved complex values {r0,i0} (\VSOUT1)
 * and {r1,i1} (\VSOUT2), in the element order stxv expects. */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 
	xxmrghd  \VSOUT1,\VSIN2,\VSIN1
	xxmrgld  \VSOUT2,\VSIN2,\VSIN1
.endm

/* Store two consecutive 16-byte complex values at \REG + \LOFFSET. */
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
	stxv	\VSIN1,	DISPX(\LOFFSET)(\REG)
	stxv	\VSIN2,	DISPX(\LOFFSET+16)(\REG)
.endm
 | 
						|
 | 
						|
 | 
						|
/* Finalize and store 8 complex results (128 bytes) at \BASE_REG+\LOFFSET.
 * \VSRes1..16 hold the raw accumulator pairs (rr/ii and ri/ir products
 * for 8 elements).  Pipeline: pack products -> load C (non-TRMM) ->
 * combine per conjugation variant -> scale by alpha -> unpack -> store.
 * Loads, arithmetic and stores are interleaved to hide latency; vs2-vs27
 * and \VSRes1-4 are used as scratch. */
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
  LOAD_COUPLE_AS_RR_II	vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs4,vs5
  LOAD_COUPLE_AS_RR_II	vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
  LOAD_COUPLE_AS_RR_II	vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs8,vs9 
  LOAD_COUPLE_AS_RR_II	vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
  AGGREGATE_REALS_IMAGES	vs2,vs3,vs4,vs5
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes10,\VSRes12,vs12,vs13 
  AGGREGATE_REALS_IMAGES	vs6,vs7,vs8,vs9  
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
  MULT_APLHA_PART1	vs2,vs4, vs14,vs15
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes14,\VSRes16,\VSRes3,\VSRes4
  MULT_APLHA_PART1	vs6,vs8,vs16,vs17
  MULT_APLHA_PART2  vs2,vs4,vs14,vs15 
  AGGREGATE_REALS_IMAGES	vs10,vs11,vs12,vs13
  MULT_APLHA_PART2	vs6,vs8,vs16,vs17
  AGGREGATE_REALS_IMAGES	\VSRes1,\VSRes2,\VSRes3,\VSRes4	
  UNPACK_FOR_STORE	vs14,vs15,vs7,vs9
  MULT_APLHA_PART1	vs10,vs12, vs24,vs25
  UNPACK_FOR_STORE	vs16,vs17,vs3,vs5 
  MULT_APLHA_PART1	\VSRes1,\VSRes3, vs26,vs27
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs7,vs9
  MULT_APLHA_PART2	vs10,vs12,vs24,vs25
  STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs3,vs5 
  MULT_APLHA_PART2	\VSRes1,\VSRes3, vs26,vs27
  UNPACK_FOR_STORE	vs24,vs25,vs10,vs12
  UNPACK_FOR_STORE	vs26,vs27,\VSRes1,\VSRes3
  STORE_COUPLE	\BASE_REG,(\LOFFSET +64),vs10,vs12
  STORE_COUPLE	\BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
 | 
						|
 | 
						|
 | 
						|
/* Finalize and store 4 complex results (64 bytes) at \BASE_REG+\LOFFSET.
 * Same pipeline as SAVE8 (pack -> load C -> aggregate -> alpha-scale ->
 * unpack -> store), using vs2-vs21 as scratch. */
.macro SAVE4  VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
  LOAD_COUPLE_AS_RR_II	vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs4,vs5
  LOAD_COUPLE_AS_RR_II	vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs8,vs9 
  AGGREGATE_REALS_IMAGES	vs2,vs3,vs4,vs5	
  AGGREGATE_REALS_IMAGES	vs6,vs7,vs8,vs9  
  MULT_APLHA_PART1	vs2,vs4, vs14,vs15
  MULT_APLHA_PART1	vs6,vs8, vs16,vs17
  MULT_APLHA_PART2	vs2,vs4, vs14,vs15 
  MULT_APLHA_PART2	vs6,vs8,vs16,vs17
  UNPACK_FOR_STORE	vs14,vs15,vs7,vs9
  UNPACK_FOR_STORE	vs16,vs17,vs3,vs5
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs7,vs9
  STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs3,vs5
.endm
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/* Finalize and store 2 complex results (32 bytes) at \BASE_REG+\LOFFSET.
 * Single pass of the SAVE pipeline; vs2-vs19 are scratch. */
.macro SAVE2  VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
  LOAD_COUPLE_AS_RR_II	vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs4,vs5	
  AGGREGATE_REALS_IMAGES	vs2,vs3,vs4,vs5	
  MULT_APLHA_PART1	vs2,vs4, vs14,vs15	
  MULT_APLHA_PART2	vs2,vs4, vs14,vs15  
  UNPACK_FOR_STORE	vs14,vs15,vs7,vs9	
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs7,vs9  
.endm
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/* Finalize and store 1 complex result (16 bytes) at \BASE_REG+\LOFFSET.
 * The single value is duplicated into both vector lanes (pack macros are
 * called with the same input twice, and C is splatted via xxmrgld/xxmrghd)
 * so the lane-pair SAVE pipeline can be reused; only one 16-byte store
 * is finally emitted. */
.macro SAVE1  VSRes1,VSRes2,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
#ifndef TRMMKERNEL 
  lxv	vs18,	(\LOFFSET)(\BASE_REG) 
  xxmrgld  vs14,vs18,vs18	/* splat real part of C into both lanes */
  xxmrghd  vs15,vs18,vs18	/* splat imag part of C into both lanes */
#endif	
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes2,vs4,vs5	
  AGGREGATE_REALS_IMAGES	vs2,vs3,vs4,vs5	
  MULT_APLHA_PART1	vs2,vs4, vs14,vs15	
  MULT_APLHA_PART2	vs2,vs4, vs14,vs15  
  UNPACK_FOR_STORE	vs14,vs15,vs7,vs9 
  xxmrghd  vs7,vs15,vs14	/* re-interleave the single {r,i} pair */
  stxv	vs7,	(\LOFFSET)(\BASE_REG) 
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=2 and M=8
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
/* Clear all 32 accumulators (vs32-vs63) for the N=2, M=8 tile:
 * vs32-vs47 accumulate column 0, vs48-vs63 column 1. */
.macro Zero2x8
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
	xxlxor	vs36,	vs36,	vs36
	xxlxor	vs37,	vs37,	vs37
	xxlxor	vs38,	vs38,	vs38
	xxlxor	vs39,	vs39,	vs39
	xxlxor	vs40,	vs40,	vs40
	xxlxor	vs41,	vs41,	vs41
	xxlxor	vs42,	vs42,	vs42
	xxlxor	vs43,	vs43,	vs43
	xxlxor	vs44,	vs44,	vs44
	xxlxor	vs45,	vs45,	vs45
	xxlxor	vs46,	vs46,	vs46
	xxlxor	vs47,	vs47,	vs47
	xxlxor	vs48,	vs48,	vs48
	xxlxor	vs49,	vs49,	vs49
	xxlxor	vs50,	vs50,	vs50
	xxlxor	vs51,	vs51,	vs51
	xxlxor	vs52,	vs52,	vs52
	xxlxor	vs53,	vs53,	vs53
	xxlxor	vs54,	vs54,	vs54
	xxlxor	vs55,	vs55,	vs55
	xxlxor	vs56,	vs56,	vs56
	xxlxor	vs57,	vs57,	vs57
	xxlxor	vs58,	vs58,	vs58
	xxlxor	vs59,	vs59,	vs59
	xxlxor	vs60,	vs60,	vs60
	xxlxor	vs61,	vs61,	vs61
	xxlxor	vs62,	vs62,	vs62
	xxlxor	vs63,	vs63,	vs63
.endm
 | 
						|
 | 
						|
 | 
						|
/* Load one K-iteration of the 2x8 tile with zero offsets. */
.macro LOAD2x8   
	LOAD2x8O 0,0 
.endm

/* Load one K-iteration of the 2x8 tile:
 * B: two complex values -> vs16/vs18, plus doubleword-swapped copies
 *    vs17/vs19 (imag/real order) for the cross products.
 * A: eight complex values -> vs0-vs7.
 * Pointers AO/BO are not advanced here (see END2x8). */
.macro LOAD2x8O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B 
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs4,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs5,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs6,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs7,	(112+\OffsetA)(AO)	// load real,imag from A
.endm
 | 
						|
 | 
						|
 | 
						|
/* Close the 2x8 K-loop after a LOAD2x8: advance AO by 128 (8 elements)
 * and BO by 32 (2 elements), then accumulate. */
.macro END2x8_NORMAL
	END2x8 AO,BO,128,32
.endm

/* Accumulate without advancing the pointers (offsets already applied). */
.macro END2x8_WITHOUT_ADD
	END2x8 AO,BO,0,0
.endm

/* Advance \AREG/\BREG by the given byte offsets (when non-zero), then
 * perform one K-step of FMAs for the 2x8 tile:
 *   column 0: vs32-vs47 += A[j] * {vs16 direct, vs17 swapped}
 *   column 1: vs48-vs63 += A[j] * {vs18 direct, vs19 swapped}
 * Even accumulators hold the rr/ii products, odd ones ri/ir, to be
 * combined later by the SAVE macros. */
.macro END2x8	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs48,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs49,	vs0,	vs19
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs50,	vs1,	vs18
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs51,	vs1,	vs19
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs52,	vs2,	vs18
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs53,	vs2,	vs19
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs54,	vs3,	vs18
	xvmaddadp	vs39,	vs3,	vs17
	xvmaddadp	vs55,	vs3,	vs19
	xvmaddadp	vs40,	vs4,	vs16
	xvmaddadp	vs56,	vs4,	vs18
	xvmaddadp	vs41,	vs4,	vs17
	xvmaddadp	vs57,	vs4,	vs19
	xvmaddadp	vs42,	vs5,	vs16
	xvmaddadp	vs58,	vs5,	vs18
	xvmaddadp	vs43,	vs5,	vs17
	xvmaddadp	vs59,	vs5,	vs19
	xvmaddadp	vs44,	vs6,	vs16
	xvmaddadp	vs60,	vs6,	vs18
	xvmaddadp	vs45,	vs6,	vs17
	xvmaddadp	vs61,	vs6,	vs19
	xvmaddadp	vs46,	vs7,	vs16
	xvmaddadp	vs62,	vs7,	vs18
	xvmaddadp	vs47,	vs7,	vs17
	xvmaddadp	vs63,	vs7,	vs19
.endm
 | 
						|
 | 
						|
 | 
						|
/* Preload TWO K-iterations of the 2x8 tile with zero offsets. */
.macro LOAD2x8_2
    LOAD2x8_2O 0,0
.endm	

/* Preload two K-iterations for the unrolled-by-2 kernel:
 * B: iteration 0 -> vs16/vs18 (+ swapped vs17/vs19); iteration 1 ->
 *    vs20/vs22 (their swaps vs21/vs23 are made inside KERNEL2x8_2).
 * A: iteration 0 -> vs0-vs7, iteration 1 -> vs8-vs15.
 * Pointers are not advanced here. */
.macro LOAD2x8_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B
	lxv	vs20,	(\OffsetB+32)(BO)	// load real,imag	from B
	lxv	vs22,	(\OffsetB+48)(BO)	// load real,imag  from B	
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs4,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs5,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs6,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs7,	(112+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(128+0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs9,	(128+16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs10,   (128+32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs11,   (128+48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs12,   (128+64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs13,   (128+80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs14,   (128+96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs15,   (128+112+\OffsetA)(AO)	// load real,imag from A
.endm	
 | 
						|
 | 
						|
 | 
						|
/* Final step of the unrolled-by-2 loop: consume both preloaded
 * iterations, advance pointers, load nothing further. */
.macro END2x8_2	  
  /*for load2 offset will be 256 and 64*/
   KERNEL2x8_2	AO,BO,	256,64,0 ,1,1 
.endm

/* "E2" = last unrolled step: compute only, no next-iteration loads
 * (Complete=1). */
.macro KERNEL2x8_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL2x8_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* "L2" = steady-state unrolled step: compute and preload the next two
 * iterations (Complete=0). */
.macro KERNEL2x8_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL2x8_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm
 | 
						|
 | 
						|
 | 
						|
/* Two K-iterations of the 2x8 tile, software-pipelined.
 * Inputs: iteration 0 operands in vs0-vs7 (A) and vs16-vs19 (B),
 * iteration 1 operands in vs8-vs15 (A) and vs20/vs22 (B, swaps built
 * here into vs21/vs23).
 * \Complete==0: while computing, reload the NEXT two iterations into the
 *   same registers (loads are interleaved between FMA groups so each
 *   register is reloaded only after its last use).
 * \IsLast==1: advance \AREG by 256 (+\OffsetA when Complete) and \BREG
 *   by 64 bytes per unrolled step, scaled by \Index via DISP16/DISP4. */
.macro KERNEL2x8_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	/* -------- iteration 0: B values vs16-vs19 -------- */
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs48,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs49,	vs0,	vs19
  xxswapd	vs21, vs20	/* build swapped B copies for iteration 1 */
  xxswapd	vs23, vs22
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs50,	vs1,	vs18
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs51,	vs1,	vs19
.if \Complete==0	
	lxv	vs0,	DISP16(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs1,	DISP16(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs52,	vs2,	vs18
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs53,	vs2,	vs19
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs54,	vs3,	vs18
	xvmaddadp	vs39,	vs3,	vs17
	xvmaddadp	vs55,	vs3,	vs19
.if \Complete==0	
	lxv	vs2,	DISP16(\Index,32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs3,	DISP16(\Index,48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs40,	vs4,	vs16
	xvmaddadp	vs56,	vs4,	vs18
	xvmaddadp	vs41,	vs4,	vs17
	xvmaddadp	vs57,	vs4,	vs19
	xvmaddadp	vs42,	vs5,	vs16
	xvmaddadp	vs58,	vs5,	vs18
	xvmaddadp	vs43,	vs5,	vs17
	xvmaddadp	vs59,	vs5,	vs19
.if \Complete==0		
	lxv	vs4,	DISP16(\Index,64+ \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs5,	DISP16(\Index,64+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs44,	vs6,	vs16
	xvmaddadp	vs60,	vs6,	vs18
	xvmaddadp	vs45,	vs6,	vs17
	xvmaddadp	vs61,	vs6,	vs19
	xvmaddadp	vs46,	vs7,	vs16
	xvmaddadp	vs62,	vs7,	vs18
	xvmaddadp	vs47,	vs7,	vs17
	xvmaddadp	vs63,	vs7,	vs19	
.if \Complete==0		
	lxv	vs16,	DISP4(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
	lxv	vs18,	DISP4(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
.endif
	/* -------- iteration 1: B values vs20-vs23 -------- */
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs48,	vs8,	vs22
.if \Complete==0
	lxv	vs6,	DISP16(\Index,64+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs7,	DISP16(\Index,64+48 + \OffsetA)(\AREG)	// load real,imag from A	
.endif	
	xvmaddadp	vs33,	vs8,	vs21
	xvmaddadp	vs49,	vs8,	vs23
.if \Complete==0		
  xxswapd	vs17, vs16	/* swapped copies for the next iteration 0 */
  xxswapd	vs19, vs18
.endif
	xvmaddadp	vs34,	vs9,	vs20
	xvmaddadp	vs50,	vs9,	vs22
	xvmaddadp	vs35,	vs9,	vs21
	xvmaddadp	vs51,	vs9,	vs23
.if \Complete==0		
	lxv	vs8,	DISP16(\Index,128+ + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs9,	DISP16(\Index,128+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif
	xvmaddadp	vs36,	vs10,	vs20
	xvmaddadp	vs52,	vs10,	vs22
	xvmaddadp	vs37,	vs10,	vs21
	xvmaddadp	vs53,	vs10,	vs23
	xvmaddadp	vs38,	vs11,	vs20
	xvmaddadp	vs54,	vs11,	vs22
	xvmaddadp	vs39,	vs11,	vs21
	xvmaddadp	vs55,	vs11,	vs23
.if \Complete==0	
	lxv	vs10,	DISP16(\Index,128+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs11,	DISP16(\Index,128+48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs40,	vs12,	vs20
	xvmaddadp	vs56,	vs12,	vs22
	xvmaddadp	vs41,	vs12,	vs21
	xvmaddadp	vs57,	vs12,	vs23
	xvmaddadp	vs42,	vs13,	vs20
	xvmaddadp	vs58,	vs13,	vs22
	xvmaddadp	vs43,	vs13,	vs21
	xvmaddadp	vs59,	vs13,	vs23
.if \Complete==0	
	lxv	vs12,	DISP16(\Index, 192 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs13,	DISP16(\Index,192 +16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs44,	vs14,	vs20
	xvmaddadp	vs60,	vs14,	vs22
	xvmaddadp	vs45,	vs14,	vs21
	xvmaddadp	vs61,	vs14,	vs23
	xvmaddadp	vs46,	vs15,	vs20
	xvmaddadp	vs62,	vs15,	vs22
	xvmaddadp	vs47,	vs15,	vs21
	xvmaddadp	vs63,	vs15,	vs23
.if \Complete==0	
	lxv	vs14,	DISP16(\Index,192 +32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs15,	DISP16(\Index,192 +48 + \OffsetA)(\AREG)	// load real,imag from A
 	lxv	vs20,	DISP4(\Index, 32+\OffsetB)(\BREG)	// load real,imag	from B
	lxv	vs22,	DISP4(\Index, 48+\OffsetB)(\BREG)	// load real,imag  from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP16(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP4(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP16(\Index,256)
	addi	\BREG, \BREG,  DISP4(\Index,64)
.endif
.endif 
.endm
 | 
						|
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/* One full (non-unrolled) K-iteration of the 2x8 tile: load, accumulate,
 * and advance the pointers. */
.macro KERNEL2x8
  LOAD2x8
  END2x8  AO, BO, 128,32
.endm

/* Store the finished 2x8 tile: column 0 (vs32-vs47) at CO, column 1
 * (vs48-vs63) at CO+LDC (via T1), then advance CO past the 8 complex
 * elements (128 bytes). */
.macro SAVE2x8
	add	T1, CO ,LDC 
	SAVE8  vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
	SAVE8  vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0  
	addi	CO, CO, 128
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=2 and M=4
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/* Clear the 16 accumulators (vs32-vs47) for the N=2, M=4 tile:
 * vs32-vs39 accumulate column 0, vs40-vs47 column 1. */
.macro Zero2x4
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
	xxlxor	vs36,	vs36,	vs36
	xxlxor	vs37,	vs37,	vs37
	xxlxor	vs38,	vs38,	vs38
	xxlxor	vs39,	vs39,	vs39
	xxlxor	vs40,	vs40,	vs40
	xxlxor	vs41,	vs41,	vs41
	xxlxor	vs42,	vs42,	vs42
	xxlxor	vs43,	vs43,	vs43
	xxlxor	vs44,	vs44,	vs44
	xxlxor	vs45,	vs45,	vs45
	xxlxor	vs46,	vs46,	vs46
	xxlxor	vs47,	vs47,	vs47
.endm
 | 
						|
 | 
						|
 | 
						|
/* Load one K-iteration of the 2x4 tile with zero offsets. */
.macro LOAD2x4   
	LOAD2x4O 0,0 
.endm

/* Load one K-iteration of the 2x4 tile: two B values -> vs16/vs18
 * (+ swapped copies vs17/vs19), four A values -> vs0-vs3.
 * Pointers are not advanced here (see END2x4). */
.macro LOAD2x4O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B 
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A  
.endm

/* Close the 2x4 K-loop: advance AO by 64 (4 elements), BO by 32. */
.macro END2x4_NORMAL
	END2x4 AO,BO,64,32
.endm

/* Accumulate without advancing the pointers. */
.macro END2x4_WITHOUT_ADD
	END2x4 AO,BO,0,0
.endm
 | 
						|
 | 
						|
 | 
						|
/* Advance \AREG/\BREG by the given byte offsets (when non-zero), then
 * one K-step of FMAs for the 2x4 tile:
 *   column 0: vs32-vs39 += A[j] * {vs16 direct, vs17 swapped}
 *   column 1: vs40-vs47 += A[j] * {vs18 direct, vs19 swapped}
 * Even/odd accumulator pairs hold rr/ii vs ri/ir products. */
.macro END2x4	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs40,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs41,	vs0,	vs19
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs42,	vs1,	vs18
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs43,	vs1,	vs19
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs44,	vs2,	vs18
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs45,	vs2,	vs19
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs46,	vs3,	vs18
	xvmaddadp	vs39,	vs3,	vs17
	xvmaddadp	vs47,	vs3,	vs19
.endm
 | 
						|
 | 
						|
 | 
						|
/* Preload TWO K-iterations of the 2x4 tile with zero offsets. */
.macro LOAD2x4_2
    LOAD2x4_2O 0,0
.endm	

/* Preload two K-iterations for the unrolled-by-2 2x4 kernel:
 * B: iteration 0 -> vs16/vs18 (+ swaps vs17/vs19); iteration 1 ->
 *    vs20/vs22 (swaps built inside KERNEL2x4_2).
 * A: iteration 0 -> vs0-vs3, iteration 1 -> vs8-vs11. */
.macro LOAD2x4_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B
	lxv	vs20,	(\OffsetB+32)(BO)	// load real,imag	from B
	lxv	vs22,	(\OffsetB+48)(BO)	// load real,imag  from B	
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs9,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs10,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs11,	(112+\OffsetA)(AO)	// load real,imag from A 
.endm	

/* Final step of the unrolled-by-2 2x4 loop: consume both preloaded
 * iterations and advance the pointers; no further loads. */
.macro END2x4_2	  
  /*for load2 offset will be 128 and 64*/
   KERNEL2x4_2	AO,BO,	128,64,0 ,1,1 
.endm
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
/* "E2" = last unrolled step of the 2x4 kernel: compute only, no
 * next-iteration loads (Complete=1). */
.macro KERNEL2x4_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL2x4_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* "L2" = steady-state unrolled step: compute and preload the next two
 * iterations (Complete=0). */
.macro KERNEL2x4_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL2x4_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm
 | 
						|
 | 
						|
 | 
						|
/* Two K-iterations of the 2x4 tile, software-pipelined.
 * Inputs: iteration 0 in vs0-vs3 (A) / vs16-vs19 (B); iteration 1 in
 * vs8-vs11 (A) / vs20,vs22 (B, swaps built here into vs21/vs23).
 * \Complete==0: reload the next two iterations while computing, each
 *   register after its last use.
 * \IsLast==1: advance \AREG by 128 and \BREG by 64 bytes per unrolled
 *   step (or by the explicit offsets when Complete), scaled by \Index. */
.macro KERNEL2x4_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	/* -------- iteration 0: B values vs16-vs19 -------- */
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs40,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs41,	vs0,	vs19
  xxswapd	vs21, vs20	/* build swapped B copies for iteration 1 */
  xxswapd	vs23, vs22
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs42,	vs1,	vs18
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs43,	vs1,	vs19
.if \Complete==0	
	lxv	vs0,	DISP8(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs1,	DISP8(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs44,	vs2,	vs18
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs45,	vs2,	vs19
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs46,	vs3,	vs18
	xvmaddadp	vs39,	vs3,	vs17
	xvmaddadp	vs47,	vs3,	vs19
.if \Complete==0	
	lxv	vs2,	DISP8(\Index,32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs3,	DISP8(\Index,48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
.if \Complete==0		
	lxv	vs16,	DISP4(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
	lxv	vs18,	DISP4(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
.endif
	/* -------- iteration 1: B values vs20-vs23 -------- */
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs40,	vs8,	vs22 
	xvmaddadp	vs33,	vs8,	vs21
	xvmaddadp	vs41,	vs8,	vs23
.if \Complete==0		
  xxswapd	vs17, vs16	/* swapped copies for the next iteration 0 */
  xxswapd	vs19, vs18
.endif
	xvmaddadp	vs34,	vs9,	vs20
	xvmaddadp	vs42,	vs9,	vs22
	xvmaddadp	vs35,	vs9,	vs21
	xvmaddadp	vs43,	vs9,	vs23
.if \Complete==0		
	lxv	vs8,	DISP8(\Index,64+0+ \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs9,	DISP8(\Index,64+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif
	xvmaddadp	vs36,	vs10,	vs20
	xvmaddadp	vs44,	vs10,	vs22
	xvmaddadp	vs37,	vs10,	vs21
	xvmaddadp	vs45,	vs10,	vs23
	xvmaddadp	vs38,	vs11,	vs20
	xvmaddadp	vs46,	vs11,	vs22
	xvmaddadp	vs39,	vs11,	vs21
	xvmaddadp	vs47,	vs11,	vs23
.if \Complete==0	
	lxv	vs10,	DISP8(\Index,64+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs11,	DISP8(\Index,64+48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
.if \Complete==0	 
 	lxv	vs20,	DISP4(\Index, 32+\OffsetB)(\BREG)	// load real,imag	from B
	lxv	vs22,	DISP4(\Index, 48+\OffsetB)(\BREG)	// load real,imag  from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP8(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP4(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP8(\Index,128)
	addi	\BREG, \BREG,  DISP4(\Index,64)
.endif
.endif 
.endm
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
/* One full (non-unrolled) K-iteration of the 2x4 tile: load, accumulate,
 * and advance the pointers. */
.macro KERNEL2x4
  LOAD2x4
  END2x4  AO, BO, 64,32
.endm

/* Store the finished 2x4 tile: column 0 (vs32-vs39) at CO, column 1
 * (vs40-vs47) at CO+LDC (via T1), then advance CO past 4 complex
 * elements (64 bytes). */
.macro SAVE2x4 
	add	T1, CO ,LDC 
	SAVE4  vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
	SAVE4  vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0  
	addi	CO, CO, 64
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=2 and M=2
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/* Clear the 8 accumulators (vs32-vs39) for the N=2, M=2 tile:
 * vs32-vs35 accumulate column 0, vs36-vs39 column 1. */
.macro Zero2x2
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
	xxlxor	vs36,	vs36,	vs36
	xxlxor	vs37,	vs37,	vs37
	xxlxor	vs38,	vs38,	vs38
	xxlxor	vs39,	vs39,	vs39
.endm
 | 
						|
 | 
						|
 | 
						|
/* Load one K-iteration of the 2x2 tile with zero offsets. */
.macro LOAD2x2   
	LOAD2x2O 0,0 
.endm

/* Load one K-iteration of the 2x2 tile: two B values -> vs16/vs18
 * (+ swapped copies vs17/vs19), two A values -> vs0/vs1.
 * Pointers are not advanced here (see END2x2). */
.macro LOAD2x2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B 
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
.endm

/* Close the 2x2 K-loop: advance AO by 32 (2 elements), BO by 32. */
.macro END2x2_NORMAL
	END2x2 AO,BO,32,32
.endm

/* Accumulate without advancing the pointers. */
.macro END2x2_WITHOUT_ADD
	END2x2 AO,BO,0,0
.endm
 | 
						|
 | 
						|
 | 
						|
.macro END2x2	AREG, BREG, OffsetA, OffsetB
 | 
						|
.if \OffsetB != 0
 | 
						|
	addi	\BREG, \BREG, \OffsetB
 | 
						|
.endif
 | 
						|
.if \OffsetA != 0
 | 
						|
	addi	\AREG, \AREG, \OffsetA
 | 
						|
.endif
 | 
						|
	xvmaddadp	vs32,	vs0,	vs16
 | 
						|
	xvmaddadp	vs36,	vs0,	vs18
 | 
						|
	xvmaddadp	vs33,	vs0,	vs17
 | 
						|
	xvmaddadp	vs37,	vs0,	vs19
 | 
						|
	xvmaddadp	vs34,	vs1,	vs16
 | 
						|
	xvmaddadp	vs38,	vs1,	vs18
 | 
						|
	xvmaddadp	vs35,	vs1,	vs17
 | 
						|
	xvmaddadp	vs39,	vs1,	vs19 
 | 
						|
 | 
						|
.endm
 | 
						|
 | 
						|
 | 
						|
.macro LOAD2x2_2
 | 
						|
    LOAD2x2_2O 0,0
 | 
						|
.endm	
 | 
						|
 | 
						|
 | 
						|
.macro LOAD2x2_2O  OffsetA,OffsetB
 | 
						|
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
 | 
						|
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B
 | 
						|
	lxv	vs20,	(\OffsetB+32)(BO)	// load real,imag	from B
 | 
						|
	lxv	vs22,	(\OffsetB+48)(BO)	// load real,imag  from B	
 | 
						|
	xxswapd	vs17, vs16
 | 
						|
	xxswapd	vs19, vs18
 | 
						|
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
 | 
						|
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A 
 | 
						|
	lxv	vs8,	(32+\OffsetA)(AO)	// load real,imag from A
 | 
						|
	lxv	vs9,	(48+\OffsetA)(AO)	// load real,imag from A
 | 
						|
 	
 | 
						|
.endm	
 | 
						|
 | 
						|
 | 
						|
.macro END2x2_2	  
 | 
						|
  /*for load2 offset will be 64 and 64*/
 | 
						|
   KERNEL2x2_2	AO,BO,	64,64,0 ,1,1 
 | 
						|
.endm
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
.macro KERNEL2x2_E2	OffsetA,OffsetB, Index,IsLast 
 | 
						|
  KERNEL2x2_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
 | 
						|
.endm
 | 
						|
 | 
						|
 | 
						|
.macro KERNEL2x2_L2	OffsetA,OffsetB, Index,IsLast
 | 
						|
  KERNEL2x2_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
 | 
						|
.endm
 | 
						|
 | 
						|
 | 
						|
.macro KERNEL2x2_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
 | 
						|
	xvmaddadp	vs32,	vs0,	vs16
 | 
						|
	xvmaddadp	vs36,	vs0,	vs18
 | 
						|
	xvmaddadp	vs33,	vs0,	vs17
 | 
						|
	xvmaddadp	vs37,	vs0,	vs19
 | 
						|
  xxswapd	vs21, vs20
 | 
						|
  xxswapd	vs23, vs22
 | 
						|
	xvmaddadp	vs34,	vs1,	vs16
 | 
						|
	xvmaddadp	vs38,	vs1,	vs18
 | 
						|
	xvmaddadp	vs35,	vs1,	vs17
 | 
						|
	xvmaddadp	vs39,	vs1,	vs19
 | 
						|
.if \Complete==0	
 | 
						|
	lxv	vs0,	DISP4(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
 | 
						|
	lxv	vs1,	DISP4(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
 | 
						|
.endif	 
 | 
						|
.if \Complete==0		
 | 
						|
	lxv	vs16,	DISP4(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
 | 
						|
	lxv	vs18,	DISP4(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
 | 
						|
.endif
 | 
						|
	xvmaddadp	vs32,	vs8,	vs20
 | 
						|
	xvmaddadp	vs36,	vs8,	vs22 
 | 
						|
	xvmaddadp	vs33,	vs8,	vs21
 | 
						|
	xvmaddadp	vs37,	vs8,	vs23
 | 
						|
.if \Complete==0		
 | 
						|
  xxswapd	vs17, vs16
 | 
						|
  xxswapd	vs19, vs18
 | 
						|
.endif
 | 
						|
	xvmaddadp	vs34,	vs9,	vs20
 | 
						|
	xvmaddadp	vs38,	vs9,	vs22
 | 
						|
	xvmaddadp	vs35,	vs9,	vs21
 | 
						|
	xvmaddadp	vs39,	vs9,	vs23
 | 
						|
.if \Complete==0	 
 | 
						|
 	lxv	vs20,	DISP4(\Index, 32+\OffsetB)(\BREG)	// load real,imag	from B
 | 
						|
	lxv	vs22,	DISP4(\Index, 48+\OffsetB)(\BREG)	// load real,imag  from B
 | 
						|
.endif
 | 
						|
.if \Complete==0		
 | 
						|
	lxv	vs8,	DISP4(\Index,32+0+ \OffsetA)(\AREG)	// load real,imag from A
 | 
						|
	lxv	vs9,	DISP4(\Index,32+16 + \OffsetA)(\AREG)	// load real,imag from A
 | 
						|
.endif
 | 
						|
 
 | 
						|
 
 | 
						|
 | 
						|
.if \IsLast==1
 | 
						|
.if \Complete==1
 | 
						|
	addi	\AREG, \AREG,  DISP4(\Index,\OffsetA)
 | 
						|
	addi	\BREG, \BREG,  DISP4(\Index,\OffsetB)
 | 
						|
.else
 | 
						|
	addi	\AREG, \AREG, DISP4(\Index,64)
 | 
						|
	addi	\BREG, \BREG,  DISP4(\Index,64)
 | 
						|
.endif
 | 
						|
.endif 
 | 
						|
.endm
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
.macro KERNEL2x2
 | 
						|
  LOAD2x2
 | 
						|
  END2x2  AO, BO, 32,32
 | 
						|
.endm
 | 
						|
 | 
						|
 | 
						|
 | 
						|
.macro SAVE2x2 
 | 
						|
	add	T1, CO ,LDC 
 | 
						|
	SAVE2  vs32,vs33,vs34,vs35,CO,0
 | 
						|
	SAVE2  vs36,vs37,vs38,vs39,T1,0 
 | 
						|
	addi	CO, CO, 32 
 | 
						|
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=2 and M=1
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/* Clear the 4 accumulators used by the 2x1 tile (2 rows x 1 col, each with a
   swapped-imag partner accumulator). */
.macro Zero2x1
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
.endm

/* Load one K-iteration for the 2x1 tile from the current AO/BO. */
.macro LOAD2x1   
	LOAD2x1O 0,0 
.endm

/* Load B (2 complex: vs16,vs18) plus swapped copies (vs17,vs19), and one
   complex value of A (vs0). */
.macro LOAD2x1O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B 
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A 
.endm

/* End one iteration and advance pointers: 16 bytes of A (1 complex), 32 of B. */
.macro END2x1_NORMAL
	END2x1 AO,BO,16,32
.endm

/* End one iteration without advancing the pointers. */
.macro END2x1_WITHOUT_ADD
	END2x1 AO,BO,0,0
.endm

/* Accumulate one K-iteration into vs32..vs35, optionally advancing pointers. */
.macro END2x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs34,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs35,	vs0,	vs19 
.endm

/* Preload two K-iterations (unroll-by-2 pipeline) for the 2x1 tile. */
.macro LOAD2x1_2
    LOAD2x1_2O 0,0
.endm	

/* Load two iterations of B (vs16,vs18 / vs20,vs22) and A (vs0 / vs8); only
   the first iteration's swaps are done here. */
.macro LOAD2x1_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs18,	(\OffsetB+16)(BO)	// load real,imag from B
	lxv	vs20,	(\OffsetB+32)(BO)	// load real,imag	from B
	lxv	vs22,	(\OffsetB+48)(BO)	// load real,imag  from B	
	xxswapd	vs17, vs16
	xxswapd	vs19, vs18
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(16+\OffsetA)(AO)	// load real,imag from A 
.endm	

/* Drain the 2-iteration pipeline: consume preloads, advance past both. */
.macro END2x1_2	  
  /*for load2 offset will be 32 and 64*/
   KERNEL2x1_2	AO,BO,	32,64,0 ,1,1 
.endm

/* Pipelined step, final variant (Complete=1): no refill loads. */
.macro KERNEL2x1_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL2x1_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* Pipelined step, loop variant (Complete=0): refills while accumulating. */
.macro KERNEL2x1_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL2x1_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm

/* Core unroll-by-2 2x1 step; same pipelining scheme as KERNEL2x2_2 but with a
   single A value per iteration (DISP2 stride for A, DISP4 for B). */
.macro KERNEL2x1_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
  xxswapd	vs21, vs20
  xxswapd	vs23, vs22 
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs34,	vs0,	vs18
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs35,	vs0,	vs19
.if \Complete==0	
	lxv	vs0,	DISP2(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A 
.endif	 
.if \Complete==0		
	lxv	vs16,	DISP4(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
	lxv	vs18,	DISP4(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
.endif
.if \Complete==0		
  xxswapd	vs17, vs16
  xxswapd	vs19, vs18
.endif 
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs34,	vs8,	vs22 
	xvmaddadp	vs33,	vs8,	vs21
	xvmaddadp	vs35,	vs8,	vs23
.if \Complete==0		
	lxv	vs8,	DISP2(\Index,16+0+ \OffsetA)(\AREG)	// load real,imag from A 
.endif
.if \Complete==0	 
 	lxv	vs20,	DISP4(\Index, 32+\OffsetB)(\BREG)	// load real,imag	from B
	lxv	vs22,	DISP4(\Index, 48+\OffsetB)(\BREG)	// load real,imag  from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP4(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG,  DISP4(\Index,64)
.endif
.endif 
.endm

/* Single (non-pipelined) 2x1 step: load, accumulate, advance by 16/32 bytes. */
.macro KERNEL2x1
  LOAD2x1
  END2x1  AO, BO, 16,32
.endm

/* Store the 2x1 tile: row 0 at CO, row 1 at CO+LDC; advance CO by 16 bytes
   (1 complex double). */
.macro SAVE2x1
	add	T1, CO ,LDC 
	SAVE1  vs32,vs33,CO,0
	SAVE1  vs34,vs35,T1,0  
	addi	CO, CO, 16 
.endm
 | 
						|
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=1 and M=8
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/* Clear the accumulators for the 1x8 tile.  vs32..vs47 are the 8 complex
   products (pairs of normal/swapped accumulators); vs48 is also cleared here
   although SAVE1x8 only consumes vs32..vs47. */
.macro Zero1x8
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
	xxlxor	vs36,	vs36,	vs36
	xxlxor	vs37,	vs37,	vs37
	xxlxor	vs38,	vs38,	vs38
	xxlxor	vs39,	vs39,	vs39
	xxlxor	vs40,	vs40,	vs40
	xxlxor	vs41,	vs41,	vs41
	xxlxor	vs42,	vs42,	vs42
	xxlxor	vs43,	vs43,	vs43
	xxlxor	vs44,	vs44,	vs44
	xxlxor	vs45,	vs45,	vs45
	xxlxor	vs46,	vs46,	vs46
	xxlxor	vs47,	vs47,	vs47
	xxlxor	vs48,	vs48,	vs48
.endm

/* Load one K-iteration for the 1x8 tile from the current AO/BO. */
.macro LOAD1x8   
	LOAD1x8O 0,0 
.endm

/* Load one complex value of B (vs16) plus its swapped copy (vs17), and 8
   complex values of A (vs0..vs7). */
.macro LOAD1x8O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B 
	xxswapd	vs17, vs16 
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs4,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs5,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs6,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs7,	(112+\OffsetA)(AO)	// load real,imag from A
.endm

/* End one iteration and advance pointers: 128 bytes of A (8 complex), 16 of B. */
.macro END1x8_NORMAL
	END1x8 AO,BO,128,16
.endm

/* End one iteration without advancing the pointers. */
.macro END1x8_WITHOUT_ADD
	END1x8 AO,BO,0,0
.endm

/* Accumulate one K-iteration into vs32..vs47, optionally advancing pointers. */
.macro END1x8	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs39,	vs3,	vs17
	xvmaddadp	vs40,	vs4,	vs16
	xvmaddadp	vs41,	vs4,	vs17
	xvmaddadp	vs42,	vs5,	vs16
	xvmaddadp	vs43,	vs5,	vs17
	xvmaddadp	vs44,	vs6,	vs16
	xvmaddadp	vs45,	vs6,	vs17
	xvmaddadp	vs46,	vs7,	vs16
	xvmaddadp	vs47,	vs7,	vs17
.endm

/* Preload two K-iterations (unroll-by-2 pipeline) for the 1x8 tile. */
.macro LOAD1x8_2
    LOAD1x8_2O 0,0
.endm	

/* Load two iterations of B (vs16 / vs20) and A (vs0..vs7 / vs8..vs15); only
   the first iteration's swap (vs17) is done here. */
.macro LOAD1x8_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs20,	(\OffsetB+16)(BO)	// load real,imag	from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs4,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs5,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs6,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs7,	(112+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(128+0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs9,	(128+16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs10,   (128+32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs11,   (128+48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs12,   (128+64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs13,   (128+80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs14,   (128+96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs15,   (128+112+\OffsetA)(AO)	// load real,imag from A
.endm	

/* Drain the 2-iteration pipeline: consume preloads, advance past both. */
.macro END1x8_2	  
  /*for load2 offset will be 256 and 32*/
   KERNEL1x8_2	AO,BO,	256,32,0 ,1,1 
.endm

/* Pipelined step, final variant (Complete=1): no refill loads. */
.macro KERNEL1x8_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL1x8_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* Pipelined step, loop variant (Complete=0): refills while accumulating. */
.macro KERNEL1x8_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL1x8_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm

/* Core unroll-by-2 1x8 step.  First iteration uses vs0..vs7 against vs16/vs17,
   second uses vs8..vs15 against vs20/vs21; refill loads for the next pair are
   interleaved between FMA groups (guarded by Complete==0). */
.macro KERNEL1x8_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
  xxswapd	vs21, vs20
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
.if \Complete==0	
	lxv	vs0,	DISP16(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs1,	DISP16(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs39,	vs3,	vs17
.if \Complete==0	
	lxv	vs2,	DISP16(\Index,32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs3,	DISP16(\Index,48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs40,	vs4,	vs16
	xvmaddadp	vs41,	vs4,	vs17
	xvmaddadp	vs42,	vs5,	vs16
	xvmaddadp	vs43,	vs5,	vs17
.if \Complete==0		
	lxv	vs4,	DISP16(\Index,64+ \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs5,	DISP16(\Index,64+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs44,	vs6,	vs16
	xvmaddadp	vs45,	vs6,	vs17
	xvmaddadp	vs46,	vs7,	vs16
	xvmaddadp	vs47,	vs7,	vs17
.if \Complete==0		
	lxv	vs16,	DISP2(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
.endif
.if \Complete==0		
  xxswapd	vs17, vs16
.endif
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs33,	vs8,	vs21
.if \Complete==0
	lxv	vs6,	DISP16(\Index,64+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs7,	DISP16(\Index,64+48 + \OffsetA)(\AREG)	// load real,imag from A	
.endif	
	xvmaddadp	vs34,	vs9,	vs20
	xvmaddadp	vs35,	vs9,	vs21
.if \Complete==0		
	lxv	vs8,	DISP16(\Index,128+ + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs9,	DISP16(\Index,128+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif
	xvmaddadp	vs36,	vs10,	vs20
	xvmaddadp	vs37,	vs10,	vs21
	xvmaddadp	vs38,	vs11,	vs20
	xvmaddadp	vs39,	vs11,	vs21
.if \Complete==0	
	lxv	vs10,	DISP16(\Index,128+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs11,	DISP16(\Index,128+48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs40,	vs12,	vs20
	xvmaddadp	vs41,	vs12,	vs21
	xvmaddadp	vs42,	vs13,	vs20
	xvmaddadp	vs43,	vs13,	vs21
.if \Complete==0	
	lxv	vs12,	DISP16(\Index, 192 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs13,	DISP16(\Index,192 +16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs44,	vs14,	vs20
	xvmaddadp	vs45,	vs14,	vs21
	xvmaddadp	vs46,	vs15,	vs20
	xvmaddadp	vs47,	vs15,	vs21
.if \Complete==0	
	lxv	vs14,	DISP16(\Index,192 +32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs15,	DISP16(\Index,192 +48 + \OffsetA)(\AREG)	// load real,imag from A
 	lxv	vs20,	DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag	from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP16(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP16(\Index,256)
	addi	\BREG, \BREG,  DISP2(\Index,32)
.endif
.endif 
.endm

/* Single (non-pipelined) 1x8 step: load, accumulate, advance by 128/16 bytes. */
.macro KERNEL1x8
  LOAD1x8
  END1x8  AO, BO, 128,16
.endm

/* Store the 1x8 tile at CO; advance CO by 128 bytes (8 complex doubles). */
.macro SAVE1x8
	SAVE8  vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0
	addi	CO, CO, 128
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=1 and M=4
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/* Clear the 8 accumulators used by the 1x4 tile. */
.macro Zero1x4
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35
	xxlxor	vs36,	vs36,	vs36
	xxlxor	vs37,	vs37,	vs37
	xxlxor	vs38,	vs38,	vs38
	xxlxor	vs39,	vs39,	vs39
.endm

/* Load one K-iteration for the 1x4 tile from the current AO/BO. */
.macro LOAD1x4   
	LOAD1x4O 0,0 
.endm

/* Load one complex value of B (vs16, swapped copy in vs17) and 4 complex
   values of A (vs0..vs3). */
.macro LOAD1x4O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A 
.endm

/* End one iteration and advance pointers: 64 bytes of A (4 complex), 16 of B. */
.macro END1x4_NORMAL
	END1x4 AO,BO,64,16
.endm

/* End one iteration without advancing the pointers. */
.macro END1x4_WITHOUT_ADD
	END1x4 AO,BO,0,0
.endm

/* Accumulate one K-iteration into vs32..vs39, optionally advancing pointers. */
.macro END1x4	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs39,	vs3,	vs17
.endm

/* Preload two K-iterations (unroll-by-2 pipeline) for the 1x4 tile. */
.macro LOAD1x4_2
    LOAD1x4_2O 0,0
.endm	

/* Load two iterations of B (vs16 / vs20) and A (vs0..vs3 / vs8..vs11); only
   the first iteration's swap (vs17) is done here. */
.macro LOAD1x4_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs20,	(\OffsetB+16)(BO)	// load real,imag	from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs2,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs3,	(48+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(64+\OffsetA)(AO)	// load real,imag from A
	lxv	vs9,	(80+\OffsetA)(AO)	// load real,imag from A
	lxv	vs10,	(96+\OffsetA)(AO)	// load real,imag from A
	lxv	vs11,	(112+\OffsetA)(AO)	// load real,imag from A 
.endm	

/* Drain the 2-iteration pipeline: consume preloads, advance past both. */
.macro END1x4_2	  
  /*for load2 offset will be 128 and 32*/
   KERNEL1x4_2	AO,BO,	128,32,0 ,1,1 
.endm

/* Pipelined step, final variant (Complete=1): no refill loads. */
.macro KERNEL1x4_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL1x4_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* Pipelined step, loop variant (Complete=0): refills while accumulating. */
.macro KERNEL1x4_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL1x4_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm

/* Core unroll-by-2 1x4 step; same pipelining scheme as KERNEL1x8_2 with half
   the A registers (DISP8 stride for A, DISP2 for B). */
.macro KERNEL1x4_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
  xxswapd	vs21, vs20
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
.if \Complete==0	
	lxv	vs0,	DISP8(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs1,	DISP8(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
	xvmaddadp	vs36,	vs2,	vs16
	xvmaddadp	vs37,	vs2,	vs17
	xvmaddadp	vs38,	vs3,	vs16
	xvmaddadp	vs39,	vs3,	vs17
.if \Complete==0	
	lxv	vs2,	DISP8(\Index,32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs3,	DISP8(\Index,48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
.if \Complete==0		
	lxv	vs16,	DISP2(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
.endif
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs33,	vs8,	vs21
.if \Complete==0		
  xxswapd	vs17, vs16
.endif
	xvmaddadp	vs34,	vs9,	vs20
	xvmaddadp	vs35,	vs9,	vs21
.if \Complete==0		
	lxv	vs8,	DISP8(\Index,64+0+ \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs9,	DISP8(\Index,64+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif
	xvmaddadp	vs36,	vs10,	vs20
	xvmaddadp	vs37,	vs10,	vs21
	xvmaddadp	vs38,	vs11,	vs20
	xvmaddadp	vs39,	vs11,	vs21
.if \Complete==0	
	lxv	vs10,	DISP8(\Index,64+32 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs11,	DISP8(\Index,64+48 + \OffsetA)(\AREG)	// load real,imag from A
.endif	
.if \Complete==0	 
 	lxv	vs20,	DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag	from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP8(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP8(\Index,128)
	addi	\BREG, \BREG,  DISP2(\Index,32)
.endif
.endif 
.endm

/* Single (non-pipelined) 1x4 step: load, accumulate, advance by 64/16 bytes. */
.macro KERNEL1x4
  LOAD1x4
  END1x4  AO, BO, 64,16
.endm

/* Store the 1x4 tile at CO; advance CO by 64 bytes (4 complex doubles). */
.macro SAVE1x4 
	SAVE4  vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0
	addi	CO, CO, 64
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=1 and M=2
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
/* Clear the 4 accumulators used by the 1x2 tile. */
.macro Zero1x2
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33
	xxlxor	vs34,	vs34,	vs34
	xxlxor	vs35,	vs35,	vs35 
.endm

/* Load one K-iteration for the 1x2 tile from the current AO/BO. */
.macro LOAD1x2   
	LOAD1x2O 0,0 
.endm

/* Load one complex value of B (vs16, swapped copy in vs17) and 2 complex
   values of A (vs0,vs1). */
.macro LOAD1x2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A 
.endm

/* End one iteration and advance pointers: 32 bytes of A (2 complex), 16 of B. */
.macro END1x2_NORMAL
	END1x2 AO,BO,32,16
.endm

/* End one iteration without advancing the pointers. */
.macro END1x2_WITHOUT_ADD
	END1x2 AO,BO,0,0
.endm

/* Accumulate one K-iteration into vs32..vs35, optionally advancing pointers. */
.macro END1x2	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
.endm

/* Preload two K-iterations (unroll-by-2 pipeline) for the 1x2 tile. */
.macro LOAD1x2_2
    LOAD1x2_2O 0,0
.endm	

/* Load two iterations of B (vs16 / vs20) and A (vs0,vs1 / vs8,vs9); only the
   first iteration's swap (vs17) is done here. */
.macro LOAD1x2_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs20,	(\OffsetB+16)(BO)	// load real,imag	from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs1,	(16+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(32+\OffsetA)(AO)	// load real,imag from A
	lxv	vs9,	(48+\OffsetA)(AO)	// load real,imag from A
.endm	

/* Drain the 2-iteration pipeline: consume preloads, advance past both. */
.macro END1x2_2	  
  /*for load2 offset will be 64 and 32*/
   KERNEL1x2_2	AO,BO,	64,32,0 ,1,1 
.endm

/* Pipelined step, final variant (Complete=1): no refill loads. */
.macro KERNEL1x2_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL1x2_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* Pipelined step, loop variant (Complete=0): refills while accumulating. */
.macro KERNEL1x2_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL1x2_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm

/* Core unroll-by-2 1x2 step (DISP4 stride for A, DISP2 for B). */
.macro KERNEL1x2_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	xvmaddadp	vs32,	vs0,	vs16
	xvmaddadp	vs33,	vs0,	vs17
  xxswapd	vs21, vs20
	xvmaddadp	vs34,	vs1,	vs16
	xvmaddadp	vs35,	vs1,	vs17
.if \Complete==0	
	lxv	vs0,	DISP4(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs1,	DISP4(\Index,16 + \OffsetA)(\AREG)	// load real,imag from A
.endif	 
.if \Complete==0		
	lxv	vs16,	DISP2(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
.endif
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs33,	vs8,	vs21
.if \Complete==0		
  xxswapd	vs17, vs16
.endif
	xvmaddadp	vs34,	vs9,	vs20
	xvmaddadp	vs35,	vs9,	vs21
.if \Complete==0	 
 	lxv	vs20,	DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag	from B
.endif
.if \Complete==0		
	lxv	vs8,	DISP4(\Index,32+0+ \OffsetA)(\AREG)	// load real,imag from A
	lxv	vs9,	DISP4(\Index,32+16 + \OffsetA)(\AREG)	// load real,imag from A
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP4(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP4(\Index,64)
	addi	\BREG, \BREG,  DISP2(\Index,32)
.endif
.endif 
.endm

/* Single (non-pipelined) 1x2 step: load, accumulate, advance by 32/16 bytes. */
.macro KERNEL1x2
  LOAD1x2
  END1x2  AO, BO, 32,16
.endm

/* Store the 1x2 tile at CO; advance CO by 32 bytes (2 complex doubles). */
.macro SAVE1x2 
	SAVE2  vs32,vs33,vs34,vs35,CO,0
	addi	CO, CO, 32 
.endm
 | 
						|
/**********************************************************************************************
 | 
						|
*
 | 
						|
 | 
						|
.macros for N=1 and M=1
 | 
						|
**********************************************************************************************/
 | 
						|
 | 
						|
 | 
						|
 | 
						|
/* Clear the 2 accumulators used by the 1x1 tile (one complex product). */
.macro Zero1x1
	xxlxor	vs32,	vs32,	vs32
	xxlxor	vs33,	vs33,	vs33 
.endm

/* Load one K-iteration for the 1x1 tile from the current AO/BO. */
.macro LOAD1x1   
	LOAD1x1O 0,0 
.endm

/* Load one complex value each of B (vs16, swapped copy in vs17) and A (vs0). */
.macro LOAD1x1O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A 
	xxswapd	vs17, vs16
.endm

/* End one iteration and advance pointers: 16 bytes of A, 16 of B. */
.macro END1x1_NORMAL
	END1x1 AO,BO,16,16
.endm

/* End one iteration without advancing the pointers. */
.macro END1x1_WITHOUT_ADD
	END1x1 AO,BO,0,0
.endm

/* Accumulate one K-iteration into vs32/vs33, optionally advancing pointers. */
.macro END1x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs32,	vs0,	vs16 
	xvmaddadp	vs33,	vs0,	vs17 
.endm

/* Preload two K-iterations (unroll-by-2 pipeline) for the 1x1 tile. */
.macro LOAD1x1_2
    LOAD1x1_2O 0,0
.endm	

/* Load two iterations of B (vs16 / vs20) and A (vs0 / vs8); only the first
   iteration's swap (vs17) is done here. */
.macro LOAD1x1_2O  OffsetA,OffsetB
	lxv	vs16,(\OffsetB+	0)(BO)	// load real imag from B
	lxv	vs20,	(\OffsetB+16)(BO)	// load real,imag	from B
	xxswapd	vs17, vs16
	lxv	vs0,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs8,	(16+\OffsetA)(AO)	// load real,imag from A 
.endm	

/* Drain the 2-iteration pipeline: consume preloads, advance past both. */
.macro END1x1_2	  
  /*for load2 offset will be 32 and 32*/
   KERNEL1x1_2	AO,BO,	32,32,0 ,1,1 
.endm

/* Pipelined step, final variant (Complete=1): no refill loads. */
.macro KERNEL1x1_E2	OffsetA,OffsetB, Index,IsLast 
  KERNEL1x1_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,1 
.endm

/* Pipelined step, loop variant (Complete=0): refills while accumulating. */
.macro KERNEL1x1_L2	OffsetA,OffsetB, Index,IsLast
  KERNEL1x1_2	AO,BO,	\OffsetA,\OffsetB, \Index,\IsLast ,0 
.endm

/* Core unroll-by-2 1x1 step (DISP2 stride for both A and B). */
.macro KERNEL1x1_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
  xxswapd	vs21, vs20
	xvmaddadp	vs32,	vs0,	vs16 
	xvmaddadp	vs33,	vs0,	vs17 
.if \Complete==0	
	lxv	vs0,	DISP2(\Index, 0 + \OffsetA)(\AREG)	// load real,imag from A 
.endif	 
.if \Complete==0		
	lxv	vs16,	DISP2(\Index, 0+\OffsetB)(\BREG)	// load real imag from B
.endif
.if \Complete==0		
  xxswapd	vs17, vs16
.endif 
	xvmaddadp	vs32,	vs8,	vs20
	xvmaddadp	vs33,	vs8,	vs21 
.if \Complete==0		
	lxv	vs8,	DISP2(\Index,16+0+ \OffsetA)(\AREG)	// load real,imag from A 
.endif
.if \Complete==0	 
 	lxv	vs20,	DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag	from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG,  DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG,  DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG,  DISP2(\Index,32)
.endif
.endif 
.endm

/* Single (non-pipelined) 1x1 step: load, accumulate, advance by 16/16 bytes. */
.macro KERNEL1x1
  LOAD1x1
  END1x1  AO, BO, 16,16
.endm

/* Store the 1x1 result at CO; advance CO by 16 bytes (1 complex double). */
.macro SAVE1x1
	SAVE1  vs32,vs33,CO,0
	addi	CO, CO, 16 
.endm
 | 
						|
 | 
						|
/****************************TRMM POINTER REFRESH MACROS*************************/
 | 
						|
 | 
						|
 | 
						|
/* REG1 = REG2 * (SHIFT_VAL * 16): convert an element count scaled by the tile
   width SHIFT_VAL into a byte offset, where one double-complex element is
   16 bytes.  Each shift amount is log2(SHIFT_VAL * 16). */
.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
		.if \SHIFT_VAL==16 
			slwi		\REG1,	\REG2,	8			
		.elseif \SHIFT_VAL==8  
			slwi		\REG1,	\REG2,	7			 
		.elseif \SHIFT_VAL==4
			slwi		\REG1,	\REG2,	6			  
		.elseif \SHIFT_VAL==2
			slwi		\REG1,	\REG2,	5			 
		.elseif \SHIFT_VAL==1
			slwi		\REG1,	\REG2,	4			 
		.endif
.endm
 | 
						|
/*
//#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		ptrbb = bb;
// #else
// 		ptrba += off*16;
// 		ptrbb = bb + off*2;
// #endif
*/
 | 
						|
 | 
						|
 | 
						|
/* REFRESH_POINTERS: position the A/B pointers for the next TRMM tile.
   In the "upper" cases PTR_B simply restarts at B_VAL; otherwise both
   pointers are advanced by OFF_VAL iterations' worth of data, where C_A
   and C_B are the element counts consumed per iteration from A and B
   (converted to byte offsets by SHIFT_REG).  Clobbers T2 and T4. */
.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
    #if (defined(LEFT) &&  defined(TRANSA)) ||  (!defined(LEFT) && !defined(TRANSA))
        /* ptrbb = bb;*/
        mr \PTR_B,\B_VAL     /* refresh BPOINT */
    #else
        /*
        // ptrba = ptrba + off*C_A;
        // ptrbb = bb + off*C_B;
        */
		SHIFT_REG T4,\OFF_VAL,\C_B		/* T4 = off*C_B in bytes (values in B shifted) */
		SHIFT_REG T2,\OFF_VAL,\C_A		/* T2 = off*C_A in bytes (values in A shifted) */
		add		\PTR_B,	\B_VAL ,	T4				/* ptrbb = bb + off*C_B */
		add		\PTR_A,	\PTR_A,	T2				/* ptrba += off*C_A */
    #endif
.endm
 | 
						|
 | 
						|
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// 		temp = bk-off;
// #elif defined(LEFT)
// 		temp = off+16;	// number of values in A
// #else
// 		temp = off+2;	// number of values in B
// #endif
*/
 | 
						|
 | 
						|
 | 
						|
/* REFRESH_TEMP_BK: compute the K-loop trip count (temp) for a TRMM tile.
   Depending on side/transposition it is either bk-off, or off plus the
   per-iteration element count of A (INCR_A) or B (INCR_B). */
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
    #if (defined(LEFT) && !defined(TRANSA)) ||  (!defined(LEFT) && defined(TRANSA))
                            /* temp = bk-off; */
           sub \TEMP_BK,\BK_VAL,\OFF_VAL
    #elif defined(LEFT)
                            /* temp = off+INCR_A;	// number of values in A */
           addi \TEMP_BK, \OFF_VAL, \INCR_A
    #else
                            /* temp = off+INCR_B;	// number of values in B */
           addi \TEMP_BK,\OFF_VAL, \INCR_B
    #endif
.endm
 | 
						|
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		temp = bk - off;
// #ifdef LEFT
// 		temp -= 16; // number of values in A
// #else
// 		temp -= 2; // number of values in B
// #endif
// 		ptrba += temp*16;
// 		ptrbb += temp*2;
// #endif
// #ifdef LEFT
// 		off += 16; // number of values in A
// #endif
*/
 | 
						|
 
 | 
						|
 | 
						|
 | 
						|
/* REFRESH_AFTER_SAVE: after storing a TRMM tile, advance PTR_A/PTR_B past
   the K iterations this tile skipped, and (when LEFT) bump OFF_VAL by C_A.
   C_A/C_B are the element counts consumed per K iteration from A and B;
   SHIFT_REG converts the remaining iteration count into byte offsets.
   Clobbers T2 and T4. */
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
    #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
                    /* temp = bk - off; */
                sub \TEMP_BK,\BK_VAL,\OFF_VAL
    #ifdef LEFT
                    /* temp -= C_A; // number of values in A */
                addi \TEMP_BK,\TEMP_BK,-\C_A
    #else
                    /* temp -= C_B; // number of values in B */
                addi \TEMP_BK,\TEMP_BK,-\C_B
    #endif
                    /* ptrba += temp*C_A;
                       ptrbb += temp*C_B; */
                SHIFT_REG T4,\TEMP_BK,\C_A
                SHIFT_REG T2,\TEMP_BK,\C_B
                add \PTR_A, \PTR_A,T4	/* ptrba += temp*C_A */
                add \PTR_B, \PTR_B,T2	/* ptrbb += temp*C_B */
    #endif
    #ifdef LEFT
                    /* off += C_A; // number of values in A */
                 addi \OFF_VAL,\OFF_VAL,\C_A
    #endif
.endm