/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp)  (disp)
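
/* Offset helpers: unit_size is the byte size of one double-precision
   complex element (16 bytes), so DISPn(ind,disp) = ind*n*16 + disp is the
   byte offset of unrolled iteration `ind` when n complex elements are
   consumed per step; e.g. DISP16(ind,disp) = ind*256 + disp. */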

/*	HELPERS FOR SAVE	*/

/* load {r0,i0} and {r1,i1} from C and repack them as {r0,r1} and {i0,i1} */

.macro LOAD_COUPLE_AS_RR_II  VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
  lxv	\VS_TEMP1,	DISPX(\LOFFSET)(\REG)
  lxv	\VS_TEMP2,	DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxmrghd  \VS_OUT1,\VS_TEMP1,\VS_TEMP2
  xxmrgld  \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
  xxmrgld  \VS_OUT1,\VS_TEMP1,\VS_TEMP2
  xxmrghd  \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
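
/* Illustration: with \VS_TEMP1 = {r0,i0} and \VS_TEMP2 = {r1,i1},
   xxmrghd merges the two high doublewords into {r0,r1} and xxmrgld the
   two low doublewords into {i0,i1}.  On little endian the doubleword
   order within a loaded register is reversed, so the two merges swap
   roles. */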

/* from two results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */

.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxmrghd	\VSOUT1, \VSIN1,\VSIN2	/* real*real from 2 results */
	xxmrgld	\VSOUT2, \VSIN1,\VSIN2	/* imag*imag from 2 results */
#else
	xxmrgld	\VSOUT1, \VSIN1,\VSIN2	/* real*real from 2 results */
	xxmrghd	\VSOUT2, \VSIN1,\VSIN2	/* imag*imag from 2 results */
#endif
.endm

/* from two results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */

.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxmrghd	\VSOUT1, \VSIN1,\VSIN2	/* real*imag */
	xxmrgld	\VSOUT2, \VSIN1,\VSIN2	/* imag*real */
#else
	xxmrgld	\VSOUT1, \VSIN1,\VSIN2	/* real*imag */
	xxmrghd	\VSOUT2, \VSIN1,\VSIN2	/* imag*real */
#endif
.endm

/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */

.macro AGGREGATE_REALS_IMAGES  VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	xvsubdp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvadddp  \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	xvadddp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvsubdp  \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
	xvadddp  \VSINR_OUT1,\VSINR_OUT1,\VSINR
	xvsubdp  \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else	// CC || CR || RC || RR
	/* we assume {-alpha_r,-alpha_i} for this case */
	/* the real part is i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
	xvsubdp  \VSINR_OUT1,\VSINR,\VSINR_OUT1
	/* we negate alpha_i instead to fix the sign */
	xvadddp  \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
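
/* Rationale: (a_r + i*a_i)*(b_r + i*b_i) = (a_r*b_r - a_i*b_i)
   + i*(a_r*b_i + a_i*b_r).  The NN/NT/TN/TT branch computes exactly that;
   conjugating A and/or B only flips the sign of a_i and/or b_i, which is
   folded into the add/sub pattern above (and, for CC/CR/RC/RR, into a
   negated alpha) instead of negating the loaded inputs. */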

/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1; VSOUT2 += {r0,r1} * {alpha_i,alpha_i} */

.macro MULT_APLHA_PART1  VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
	xvmsubadp	\VSOUT1,\VSINII, alpha_i
	xvmaddadp	\VSOUT2,\VSINRR, alpha_i
#else
	xvmuldp		\VSOUT1,\VSINII, alpha_i
	xvmuldp		\VSOUT2,\VSINRR, alpha_i
#endif
.endm

/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1; VSOUT2 += {i0,i1} * {alpha_r,alpha_r} */

.macro MULT_APLHA_PART2  VSINRR,VSINII,VSOUT1,VSOUT2
	xvmsubadp	\VSOUT1,\VSINRR, alpha_r
	xvmaddadp	\VSOUT2,\VSINII, alpha_r
.endm
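
/* Net effect of PART1 followed by PART2 on the non-TRMM path, where
   {VSOUT1,VSOUT2} were preloaded with {c_r,c_i} by LOAD_COUPLE_AS_RR_II:
     VSOUT1 = r*alpha_r - i*alpha_i + c_r	(new real parts)
     VSOUT2 = c_i + r*alpha_i + i*alpha_r	(new imaginary parts)
   i.e. C += alpha*result, using xvmsubadp (xT = xA*xB - xT) and
   xvmaddadp (xT = xA*xB + xT). */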

/* unpack {r0,r1},{i0,i1} back into {r0,i0},{r1,i1} pairs for stxv */

.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxmrghd  \VSOUT1,\VSIN1,\VSIN2
	xxmrgld  \VSOUT2,\VSIN1,\VSIN2
#else
	xxmrghd  \VSOUT1,\VSIN2,\VSIN1
	xxmrgld  \VSOUT2,\VSIN2,\VSIN1
#endif
.endm

.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
	stxv	\VSIN1,	DISPX(\LOFFSET)(\REG)
	stxv	\VSIN2,	DISPX(\LOFFSET+16)(\REG)
.endm

.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
  LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
  LOAD_COUPLE_AS_RR_II	vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
  LOAD_COUPLE_AS_RR_II	vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs40,vs41
  LOAD_COUPLE_AS_RR_II	vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
  AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes10,\VSRes12,vs44,vs45
  AGGREGATE_REALS_IMAGES	vs38,vs39,vs40,vs41
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
  MULT_APLHA_PART1	vs34,vs36,vs46,vs47
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes14,\VSRes16,\VSRes3,\VSRes4
  MULT_APLHA_PART1	vs38,vs40,vs48,vs49
  MULT_APLHA_PART2	vs34,vs36,vs46,vs47
  AGGREGATE_REALS_IMAGES	vs42,vs43,vs44,vs45
  MULT_APLHA_PART2	vs38,vs40,vs48,vs49
  AGGREGATE_REALS_IMAGES	\VSRes1,\VSRes2,\VSRes3,\VSRes4
  UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
  MULT_APLHA_PART1	vs42,vs44,vs56,vs57
  UNPACK_FOR_STORE	vs48,vs49,vs35,vs37
  MULT_APLHA_PART1	\VSRes1,\VSRes3,vs58,vs59
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
  MULT_APLHA_PART2	vs42,vs44,vs56,vs57
  STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs35,vs37
  MULT_APLHA_PART2	\VSRes1,\VSRes3,vs58,vs59
  UNPACK_FOR_STORE	vs56,vs57,vs42,vs44
  UNPACK_FOR_STORE	vs58,vs59,\VSRes1,\VSRes3
  STORE_COUPLE	\BASE_REG,(\LOFFSET+64),vs42,vs44
  STORE_COUPLE	\BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
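
/* SAVE8 applies the alpha update to eight consecutive complex elements
   of C at \BASE_REG+\LOFFSET (128 bytes); C loads and merge permutes are
   interleaved with the multiply-adds to help hide their latency. */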

.macro SAVE4  VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
  LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
  LOAD_COUPLE_AS_RR_II	vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs40,vs41
  AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
  AGGREGATE_REALS_IMAGES	vs38,vs39,vs40,vs41
  MULT_APLHA_PART1	vs34,vs36,vs46,vs47
  MULT_APLHA_PART1	vs38,vs40,vs48,vs49
  MULT_APLHA_PART2	vs34,vs36,vs46,vs47
  MULT_APLHA_PART2	vs38,vs40,vs48,vs49
  UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
  UNPACK_FOR_STORE	vs48,vs49,vs35,vs37
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
  STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs35,vs37
.endm

.macro SAVE2  VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
  LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
  AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
  MULT_APLHA_PART1	vs34,vs36,vs46,vs47
  MULT_APLHA_PART2	vs34,vs36,vs46,vs47
  UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
  STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
.endm

.macro SAVE1  VSRes1,VSRes2,BASE_REG,LOFFSET
  RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
  lxv	vs50,	(\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  xxmrghd  vs46,vs50,vs50
  xxmrgld  vs47,vs50,vs50
#else
  xxmrgld  vs46,vs50,vs50
  xxmrghd  vs47,vs50,vs50
#endif
#endif
  RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes2,vs36,vs37
  AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
  MULT_APLHA_PART1	vs34,vs36,vs46,vs47
  MULT_APLHA_PART2	vs34,vs36,vs46,vs47
  UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
  xxmrghd  vs39,vs47,vs46
#endif
  stxv	vs39,	(\LOFFSET)(\BASE_REG)
.endm

/**********************************************************************************************
* .macros for N=2 and M=8
**********************************************************************************************/

.macro  KERNEL2x8_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
	xxsetaccz	1
	xxsetaccz	2
	xxsetaccz	3
	xxsetaccz	4
	xxsetaccz	5
	xxsetaccz	6
	xxsetaccz	7
.endm
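
/* Note: each POWER10 MMA accumulator ACCn overlays the four VSRs
   vs(4*n)..vs(4*n+3).  xxsetaccz zeroes ACCn and primes it; while an
   accumulator is primed, its overlaid VSRs must not be accessed until
   xxmfacc copies the accumulator contents back out. */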

.macro KERNEL2x8_PRELOAD
	lxvp	vs32,	 0(AO)	// load real,imag from A
	lxvp	vs34,	32(AO)	// load real,imag from A
	lxvp	vs36,	64(AO)	// load real,imag from A
	lxvp	vs38,	96(AO)	// load real,imag from A
	lxvp	vs48,	 0(BO)	// load real,imag from B
.endm

.macro KERNEL2x8_2	Index, IsLast
	lxvp	vs40,	DISP16(\Index,128)(AO)	// load real,imag from A
	lxvp	vs42,	DISP16(\Index,160)(AO)	// load real,imag from A
	lxvp	vs44,	DISP16(\Index,192)(AO)	// load real,imag from A
	lxvp	vs46,	DISP16(\Index,224)(AO)	// load real,imag from A
	lxvp	vs50,	DISP4(\Index, 32)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs36,	vs48
	xvf64gerpp	3,	vs38,	vs48
	xvf64gerpp	4,	vs32,	vs49
	xvf64gerpp	5,	vs34,	vs49
	xvf64gerpp	6,	vs36,	vs49
	xvf64gerpp	7,	vs38,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	2,	vs36,	vs49
	xvf64gerpp	3,	vs38,	vs49
	xvf64gerpp	4,	vs32,	vs48
	xvf64gerpp	5,	vs34,	vs48
	xvf64gerpp	6,	vs36,	vs48
	xvf64gerpp	7,	vs38,	vs48
#endif
	lxvp	vs32,	DISP16(\Index,256)(AO)	// load real,imag from A
	lxvp	vs34,	DISP16(\Index,288)(AO)	// load real,imag from A
	lxvp	vs36,	DISP16(\Index,320)(AO)	// load real,imag from A
	lxvp	vs38,	DISP16(\Index,352)(AO)	// load real,imag from A
	lxvp	vs48,	DISP4(\Index, 64)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs40,	vs50
	xvf64gerpp	1,	vs42,	vs50
	xvf64gerpp	2,	vs44,	vs50
	xvf64gerpp	3,	vs46,	vs50
	xvf64gerpp	4,	vs40,	vs51
	xvf64gerpp	5,	vs42,	vs51
	xvf64gerpp	6,	vs44,	vs51
	xvf64gerpp	7,	vs46,	vs51
#else
	xvf64gerpp	0,	vs40,	vs51
	xvf64gerpp	1,	vs42,	vs51
	xvf64gerpp	2,	vs44,	vs51
	xvf64gerpp	3,	vs46,	vs51
	xvf64gerpp	4,	vs40,	vs50
	xvf64gerpp	5,	vs42,	vs50
	xvf64gerpp	6,	vs44,	vs50
	xvf64gerpp	7,	vs46,	vs50
#endif
.if \IsLast==1
	addi	AO, AO, DISP16(\Index,256)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm
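
/* xvf64gerpp acc,XAp,XB performs a rank-1 outer-product accumulate on a
   4x2 grid of doubles: the even-odd VSR pair XAp supplies four doubles
   (two complex elements of A) and XB supplies two doubles (one complex
   element of B), so one instruction accumulates all four cross products
   ar*br, ar*bi, ai*br, ai*bi for two rows at once.  The endian branches
   differ only in which half of the B register pair (vs48/vs49) feeds
   which accumulator. */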

.macro LOAD_END_2x8  OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs36,	vs48
	xvf64gerpp	3,	vs38,	vs48
	xvf64gerpp	4,	vs32,	vs49
	xvf64gerpp	5,	vs34,	vs49
	xvf64gerpp	6,	vs36,	vs49
	xvf64gerpp	7,	vs38,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	2,	vs36,	vs49
	xvf64gerpp	3,	vs38,	vs49
	xvf64gerpp	4,	vs32,	vs48
	xvf64gerpp	5,	vs34,	vs48
	xvf64gerpp	6,	vs36,	vs48
	xvf64gerpp	7,	vs38,	vs48
#endif
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL2x8_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
	xxmfacc	1
	xxmfacc	2
	xxmfacc	3
	xxmfacc	4
	xxmfacc	5
	xxmfacc	6
	xxmfacc	7
.endm
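
/* xxmfacc ACCn copies the accumulator back into its four overlaid VSRs,
   so after this macro vs0..vs31 hold the raw partial products and can be
   read normally; the SAVE macros consume them from there. */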

.macro SAVE2x8
	add	T1, CO, LDC
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
	xxpermdi vs36, vs4, vs5, 0b01
	xxpermdi vs37, vs4, vs5, 0b10
	xxpermdi vs38, vs6, vs7, 0b01
	xxpermdi vs39, vs6, vs7, 0b10
	xxpermdi vs40, vs8, vs9, 0b01
	xxpermdi vs41, vs8, vs9, 0b10
	xxpermdi vs42, vs10, vs11, 0b01
	xxpermdi vs43, vs10, vs11, 0b10
	xxpermdi vs44, vs12, vs13, 0b01
	xxpermdi vs45, vs12, vs13, 0b10
	xxpermdi vs46, vs14, vs15, 0b01
	xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
	xxlor vs4, vs36, vs36
	xxlor vs5, vs37, vs37
	xxlor vs6, vs38, vs38
	xxlor vs7, vs39, vs39
	xxlor vs8, vs40, vs40
	xxlor vs9, vs41, vs41
	xxlor vs10, vs42, vs42
	xxlor vs11, vs43, vs43
	xxlor vs12, vs44, vs44
	xxlor vs13, vs45, vs45
	xxlor vs14, vs46, vs46
	xxlor vs15, vs47, vs47
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
	xxlor vs6, vs36, vs36
	xxlor vs7, vs37, vs37
	xxlor vs4, vs38, vs38
	xxlor vs5, vs39, vs39
	xxlor vs10, vs40, vs40
	xxlor vs11, vs41, vs41
	xxlor vs8, vs42, vs42
	xxlor vs9, vs43, vs43
	xxlor vs14, vs44, vs44
	xxlor vs15, vs45, vs45
	xxlor vs12, vs46, vs46
	xxlor vs13, vs47, vs47
#endif
	xxpermdi vs32, vs16, vs17, 0b01
	xxpermdi vs33, vs16, vs17, 0b10
	xxpermdi vs34, vs18, vs19, 0b01
	xxpermdi vs35, vs18, vs19, 0b10
	xxpermdi vs36, vs20, vs21, 0b01
	xxpermdi vs37, vs20, vs21, 0b10
	xxpermdi vs38, vs22, vs23, 0b01
	xxpermdi vs39, vs22, vs23, 0b10
	xxpermdi vs40, vs24, vs25, 0b01
	xxpermdi vs41, vs24, vs25, 0b10
	xxpermdi vs42, vs26, vs27, 0b01
	xxpermdi vs43, vs26, vs27, 0b10
	xxpermdi vs44, vs28, vs29, 0b01
	xxpermdi vs45, vs28, vs29, 0b10
	xxpermdi vs46, vs30, vs31, 0b01
	xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs16, vs32, vs32
	xxlor vs17, vs33, vs33
	xxlor vs18, vs34, vs34
	xxlor vs19, vs35, vs35
	xxlor vs20, vs36, vs36
	xxlor vs21, vs37, vs37
	xxlor vs22, vs38, vs38
	xxlor vs23, vs39, vs39
	xxlor vs24, vs40, vs40
	xxlor vs25, vs41, vs41
	xxlor vs26, vs42, vs42
	xxlor vs27, vs43, vs43
	xxlor vs28, vs44, vs44
	xxlor vs29, vs45, vs45
	xxlor vs30, vs46, vs46
	xxlor vs31, vs47, vs47
#else
	xxlor vs18, vs32, vs32
	xxlor vs19, vs33, vs33
	xxlor vs16, vs34, vs34
	xxlor vs17, vs35, vs35
	xxlor vs22, vs36, vs36
	xxlor vs23, vs37, vs37
	xxlor vs20, vs38, vs38
	xxlor vs21, vs39, vs39
	xxlor vs26, vs40, vs40
	xxlor vs27, vs41, vs41
	xxlor vs24, vs42, vs42
	xxlor vs25, vs43, vs43
	xxlor vs30, vs44, vs44
	xxlor vs31, vs45, vs45
	xxlor vs28, vs46, vs46
	xxlor vs29, vs47, vs47
#endif
	SAVE8	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
	SAVE8	vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
	addi	CO, CO, 128
.endm
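
/* The xxpermdi pairs in the SAVE macros cross-combine adjacent
   accumulator rows: permute control 0b01 selects {VA[0],VB[1]} and 0b10
   selects {VA[1],VB[0]}.  The endian-guarded xxlor block then simply
   copies the shuffled registers back into vs0.. in the order the packed
   SAVE helpers expect. */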

/**********************************************************************************************
* .macros for N=2 and M=4
**********************************************************************************************/

.macro  KERNEL2x4_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
	xxsetaccz	1
	xxsetaccz	2
	xxsetaccz	3
.endm

.macro KERNEL2x4_PRELOAD
	lxvp	vs32,	 0(AO)	// load real,imag from A
	lxvp	vs34,	32(AO)	// load real,imag from A
	lxvp	vs48,	 0(BO)	// load real,imag from B
.endm

.macro KERNEL2x4_2 Index, IsLast
	lxvp	vs40,	DISP8(\Index, 64)(AO)	// load real,imag from A
	lxvp	vs42,	DISP8(\Index, 96)(AO)	// load real,imag from A
	lxvp	vs50,	DISP4(\Index, 32)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs32,	vs49
	xvf64gerpp	3,	vs34,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	2,	vs32,	vs48
	xvf64gerpp	3,	vs34,	vs48
#endif
	lxvp	vs32,	DISP8(\Index,128)(AO)	// load real,imag from A
	lxvp	vs34,	DISP8(\Index,160)(AO)	// load real,imag from A
	lxvp	vs48,	DISP4(\Index, 64)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs40,	vs50
	xvf64gerpp	1,	vs42,	vs50
	xvf64gerpp	2,	vs40,	vs51
	xvf64gerpp	3,	vs42,	vs51
#else
	xvf64gerpp	0,	vs40,	vs51
	xvf64gerpp	1,	vs42,	vs51
	xvf64gerpp	2,	vs40,	vs50
	xvf64gerpp	3,	vs42,	vs50
#endif
.if \IsLast==1
	addi	AO, AO, DISP8(\Index,128)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm

.macro LOAD_END_2x4	OffsetA, OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs32,	vs49
	xvf64gerpp	3,	vs34,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	2,	vs32,	vs48
	xvf64gerpp	3,	vs34,	vs48
#endif
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL2x4_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
	xxmfacc	1
	xxmfacc	2
	xxmfacc	3
.endm

.macro SAVE2x4
	add	T1, CO, LDC
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
	xxpermdi vs36, vs4, vs5, 0b01
	xxpermdi vs37, vs4, vs5, 0b10
	xxpermdi vs38, vs6, vs7, 0b01
	xxpermdi vs39, vs6, vs7, 0b10
	xxpermdi vs40, vs8, vs9, 0b01
	xxpermdi vs41, vs8, vs9, 0b10
	xxpermdi vs42, vs10, vs11, 0b01
	xxpermdi vs43, vs10, vs11, 0b10
	xxpermdi vs44, vs12, vs13, 0b01
	xxpermdi vs45, vs12, vs13, 0b10
	xxpermdi vs46, vs14, vs15, 0b01
	xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
	xxlor vs4, vs36, vs36
	xxlor vs5, vs37, vs37
	xxlor vs6, vs38, vs38
	xxlor vs7, vs39, vs39
	xxlor vs8, vs40, vs40
	xxlor vs9, vs41, vs41
	xxlor vs10, vs42, vs42
	xxlor vs11, vs43, vs43
	xxlor vs12, vs44, vs44
	xxlor vs13, vs45, vs45
	xxlor vs14, vs46, vs46
	xxlor vs15, vs47, vs47
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
	xxlor vs6, vs36, vs36
	xxlor vs7, vs37, vs37
	xxlor vs4, vs38, vs38
	xxlor vs5, vs39, vs39
	xxlor vs10, vs40, vs40
	xxlor vs11, vs41, vs41
	xxlor vs8, vs42, vs42
	xxlor vs9, vs43, vs43
	xxlor vs14, vs44, vs44
	xxlor vs15, vs45, vs45
	xxlor vs12, vs46, vs46
	xxlor vs13, vs47, vs47
#endif
	SAVE4	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
	SAVE4	vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
	addi	CO, CO, 64
.endm

/**********************************************************************************************
* .macros for N=2 and M=2
**********************************************************************************************/

.macro  KERNEL2x2_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
	xxsetaccz	1
.endm

.macro KERNEL2x2_PRELOAD
	lxvp	vs32,	 0(AO)	// load real,imag from A
	lxvp	vs48,	 0(BO)	// load real,imag from B
.endm

.macro KERNEL2x2_2 Index, IsLast
	lxvp	vs40,	DISP4(\Index, 32)(AO)	// load real,imag from A
	lxvp	vs50,	DISP4(\Index, 32)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs32,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs32,	vs48
#endif
	lxvp	vs32,	DISP4(\Index, 64)(AO)	// load real,imag from A
	lxvp	vs48,	DISP4(\Index, 64)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs40,	vs50
	xvf64gerpp	1,	vs40,	vs51
#else
	xvf64gerpp	0,	vs40,	vs51
	xvf64gerpp	1,	vs40,	vs50
#endif
.if \IsLast==1
	addi	AO, AO, DISP4(\Index,64)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm

.macro LOAD_END_2x2  OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs32,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs32,	vs48
#endif
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL2x2_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
	xxmfacc	1
.endm

.macro SAVE2x2
	add	T1, CO, LDC
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
	xxpermdi vs36, vs4, vs5, 0b01
	xxpermdi vs37, vs4, vs5, 0b10
	xxpermdi vs38, vs6, vs7, 0b01
	xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
	xxlor vs4, vs36, vs36
	xxlor vs5, vs37, vs37
	xxlor vs6, vs38, vs38
	xxlor vs7, vs39, vs39
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
	xxlor vs6, vs36, vs36
	xxlor vs7, vs37, vs37
	xxlor vs4, vs38, vs38
	xxlor vs5, vs39, vs39
#endif
	SAVE2	vs0,vs1,vs2,vs3,CO,0
	SAVE2	vs4,vs5,vs6,vs7,T1,0
	addi	CO, CO, 32
.endm

/**********************************************************************************************
* .macros for N=2 and M=1
**********************************************************************************************/

.macro ZERO2x1
	xxlxor	vs0,	vs0,	vs0
	xxlxor	vs1,	vs1,	vs1
	xxlxor	vs2,	vs2,	vs2
	xxlxor	vs3,	vs3,	vs3
.endm

.macro LOAD2x1
	LOAD2x1O 0,0
.endm

.macro LOAD2x1O  OffsetA,OffsetB
	lxv	vs48,	(\OffsetB+0)(BO)	// load real,imag from B
	lxv	vs50,	(\OffsetB+16)(BO)	// load real,imag from B
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
	lxv	vs32,	(0+\OffsetA)(AO)	// load real,imag from A
.endm

.macro END2x1_WITHOUT_ADD
	END2x1 AO,BO,0,0
.endm

.macro END2x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs0,	vs32,	vs48
	xvmaddadp	vs2,	vs32,	vs50
	xvmaddadp	vs1,	vs32,	vs49
	xvmaddadp	vs3,	vs32,	vs51
.endm
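
/* The M=1 tail does not use MMA: vs48 = {br,bi} and vs49 = xxswapd(vs48)
   = {bi,br}, so vs0 += a*vs48 accumulates {ar*br, ai*bi} while
   vs1 += a*vs49 accumulates {ar*bi, ai*br}; SAVE1 later merges and
   sign-combines the two halves into the final complex result. */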

.macro LOAD2x1_2
	LOAD2x1_2O 0,0
.endm

.macro LOAD2x1_2O  OffsetA,OffsetB
	lxv	vs48,	(\OffsetB+0)(BO)	// load real,imag from B
	lxv	vs50,	(\OffsetB+16)(BO)	// load real,imag from B
	lxv	vs52,	(\OffsetB+32)(BO)	// load real,imag from B
	lxv	vs54,	(\OffsetB+48)(BO)	// load real,imag from B
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
	lxv	vs32,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs40,	(16+\OffsetA)(AO)	// load real,imag from A
.endm

.macro END2x1_2
	/* for LOAD2x1_2 the offsets are 32 (A) and 64 (B) */
	KERNEL2x1_2	AO,BO, 32,64, 0,1,1
.endm

.macro KERNEL2x1_E2	OffsetA,OffsetB, Index,IsLast
	KERNEL2x1_2	AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
.endm

.macro KERNEL2x1_L2	OffsetA,OffsetB, Index,IsLast
	KERNEL2x1_2	AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
.endm

.macro KERNEL2x1_2	AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
	xxswapd	vs53, vs52
	xxswapd	vs55, vs54
	xvmaddadp	vs0,	vs32,	vs48
	xvmaddadp	vs2,	vs32,	vs50
	xvmaddadp	vs1,	vs32,	vs49
	xvmaddadp	vs3,	vs32,	vs51
.if \Complete==0
	lxv	vs32,	DISP2(\Index, 0+\OffsetA)(\AREG)	// load real,imag from A
	lxv	vs48,	DISP4(\Index, 0+\OffsetB)(\BREG)	// load real,imag from B
	lxv	vs50,	DISP4(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
.endif
	xvmaddadp	vs0,	vs40,	vs52
	xvmaddadp	vs2,	vs40,	vs54
	xvmaddadp	vs1,	vs40,	vs53
	xvmaddadp	vs3,	vs40,	vs55
.if \Complete==0
	lxv	vs40,	DISP2(\Index, 16+\OffsetA)(\AREG)	// load real,imag from A
	lxv	vs52,	DISP4(\Index, 32+\OffsetB)(\BREG)	// load real,imag from B
	lxv	vs54,	DISP4(\Index, 48+\OffsetB)(\BREG)	// load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG, DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG, DISP4(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm

.macro KERNEL2x1
	LOAD2x1
	END2x1	AO, BO, 16,32
.endm

.macro SAVE2x1
	add	T1, CO, LDC
	SAVE1	vs0,vs1,CO,0
	SAVE1	vs2,vs3,T1,0
	addi	CO, CO, 16
.endm

/**********************************************************************************************
* .macros for N=1 and M=8
**********************************************************************************************/

.macro  KERNEL1x8_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
	xxsetaccz	1
	xxsetaccz	2
	xxsetaccz	3
.endm

.macro KERNEL1x8_2	Index, IsLast
	lxvp	vs32,	DISP16(\Index,   0)(AO)	// load real,imag from A
	lxvp	vs34,	DISP16(\Index,  32)(AO)	// load real,imag from A
	lxvp	vs36,	DISP16(\Index,  64)(AO)	// load real,imag from A
	lxvp	vs38,	DISP16(\Index,  96)(AO)	// load real,imag from A
	lxvp	vs40,	DISP16(\Index, 128)(AO)	// load real,imag from A
	lxvp	vs42,	DISP16(\Index, 160)(AO)	// load real,imag from A
	lxvp	vs44,	DISP16(\Index, 192)(AO)	// load real,imag from A
	lxvp	vs46,	DISP16(\Index, 224)(AO)	// load real,imag from A
	lxvp	vs48,	DISP2(\Index,    0)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs36,	vs48
	xvf64gerpp	3,	vs38,	vs48
	xvf64gerpp	0,	vs40,	vs49
	xvf64gerpp	1,	vs42,	vs49
	xvf64gerpp	2,	vs44,	vs49
	xvf64gerpp	3,	vs46,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	2,	vs36,	vs49
	xvf64gerpp	3,	vs38,	vs49
	xvf64gerpp	0,	vs40,	vs48
	xvf64gerpp	1,	vs42,	vs48
	xvf64gerpp	2,	vs44,	vs48
	xvf64gerpp	3,	vs46,	vs48
#endif
.if \IsLast==1
	addi	AO, AO, DISP16(\Index,256)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x8  OffsetA,OffsetB
	lxvp	vs32,	 0(AO)	// load real,imag from A
	lxvp	vs34,	32(AO)	// load real,imag from A
	lxvp	vs36,	64(AO)	// load real,imag from A
	lxvp	vs38,	96(AO)	// load real,imag from A
	lxv	vs48,	 0(BO)	// load real,imag from B
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	2,	vs36,	vs48
	xvf64gerpp	3,	vs38,	vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL1x8_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
	xxmfacc	1
	xxmfacc	2
	xxmfacc	3
.endm

.macro SAVE1x8
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
	xxpermdi vs36, vs4, vs5, 0b01
	xxpermdi vs37, vs4, vs5, 0b10
	xxpermdi vs38, vs6, vs7, 0b01
	xxpermdi vs39, vs6, vs7, 0b10
	xxpermdi vs40, vs8, vs9, 0b01
	xxpermdi vs41, vs8, vs9, 0b10
	xxpermdi vs42, vs10, vs11, 0b01
	xxpermdi vs43, vs10, vs11, 0b10
	xxpermdi vs44, vs12, vs13, 0b01
	xxpermdi vs45, vs12, vs13, 0b10
	xxpermdi vs46, vs14, vs15, 0b01
	xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
	xxlor vs4, vs36, vs36
	xxlor vs5, vs37, vs37
	xxlor vs6, vs38, vs38
	xxlor vs7, vs39, vs39
	xxlor vs8, vs40, vs40
	xxlor vs9, vs41, vs41
	xxlor vs10, vs42, vs42
	xxlor vs11, vs43, vs43
	xxlor vs12, vs44, vs44
	xxlor vs13, vs45, vs45
	xxlor vs14, vs46, vs46
	xxlor vs15, vs47, vs47
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
	xxlor vs6, vs36, vs36
	xxlor vs7, vs37, vs37
	xxlor vs4, vs38, vs38
	xxlor vs5, vs39, vs39
	xxlor vs10, vs40, vs40
	xxlor vs11, vs41, vs41
	xxlor vs8, vs42, vs42
	xxlor vs9, vs43, vs43
	xxlor vs14, vs44, vs44
	xxlor vs15, vs45, vs45
	xxlor vs12, vs46, vs46
	xxlor vs13, vs47, vs47
#endif
	SAVE8	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
	addi	CO, CO, 128
.endm

/**********************************************************************************************
* .macros for N=1 and M=4
**********************************************************************************************/

.macro  KERNEL1x4_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
	xxsetaccz	1
.endm

.macro KERNEL1x4_2	Index, IsLast
	lxvp	vs32,	DISP8(\Index,  0)(AO)	// load real,imag from A
	lxvp	vs34,	DISP8(\Index, 32)(AO)	// load real,imag from A
	lxvp	vs40,	DISP8(\Index, 64)(AO)	// load real,imag from A
	lxvp	vs42,	DISP8(\Index, 96)(AO)	// load real,imag from A
	lxvp	vs48,	DISP2(\Index,  0)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	xvf64gerpp	0,	vs40,	vs49
	xvf64gerpp	1,	vs42,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	1,	vs34,	vs49
	xvf64gerpp	0,	vs40,	vs48
	xvf64gerpp	1,	vs42,	vs48
#endif
.if \IsLast==1
	addi	AO, AO, DISP8(\Index,128)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x4  OffsetA,OffsetB
	lxvp	vs32,	 0(AO)	// load real,imag from A
	lxvp	vs34,	32(AO)	// load real,imag from A
	lxv	vs48,	 0(BO)	// load real,imag from B
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	1,	vs34,	vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL1x4_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
	xxmfacc	1
.endm

.macro SAVE1x4
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
	xxpermdi vs36, vs4, vs5, 0b01
	xxpermdi vs37, vs4, vs5, 0b10
	xxpermdi vs38, vs6, vs7, 0b01
	xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
	xxlor vs4, vs36, vs36
	xxlor vs5, vs37, vs37
	xxlor vs6, vs38, vs38
	xxlor vs7, vs39, vs39
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
	xxlor vs6, vs36, vs36
	xxlor vs7, vs37, vs37
	xxlor vs4, vs38, vs38
	xxlor vs5, vs39, vs39
#endif
	SAVE4	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
	addi	CO, CO, 64
.endm

/**********************************************************************************************
* .macros for N=1 and M=2
**********************************************************************************************/

.macro  KERNEL1x2_ZERO_AND_PRIME_MMA
	/* zero out and prime the MMA accumulators */
	xxsetaccz	0
.endm

.macro KERNEL1x2_2	Index, IsLast
	lxvp	vs32,	DISP4(\Index,  0)(AO)	// load real,imag from A
	lxvp	vs40,	DISP4(\Index, 32)(AO)	// load real,imag from A
	lxvp	vs48,	DISP2(\Index,  0)(BO)	// load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xvf64gerpp	0,	vs32,	vs48
	xvf64gerpp	0,	vs40,	vs49
#else
	xvf64gerpp	0,	vs32,	vs49
	xvf64gerpp	0,	vs40,	vs48
#endif
.if \IsLast==1
	addi	AO, AO, DISP4(\Index,64)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x2  OffsetA,OffsetB
	lxvp	vs32,	0(AO)	// load real,imag from A
	lxv	vs48,	0(BO)	// load real,imag from B
	xvf64gerpp	0,	vs32,	vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm

.macro  KERNEL1x2_UNPRIME_MMA
	/* "unprime" MMA accumulators */
	xxmfacc	0
.endm

.macro SAVE1x2
	xxpermdi vs32, vs0, vs1, 0b01
	xxpermdi vs33, vs0, vs1, 0b10
	xxpermdi vs34, vs2, vs3, 0b01
	xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
	xxlor vs0, vs32, vs32
	xxlor vs1, vs33, vs33
	xxlor vs2, vs34, vs34
	xxlor vs3, vs35, vs35
#else
	xxlor vs2, vs32, vs32
	xxlor vs3, vs33, vs33
	xxlor vs0, vs34, vs34
	xxlor vs1, vs35, vs35
#endif
	SAVE2	vs0,vs1,vs2,vs3,CO,0
	addi	CO, CO, 32
.endm

/**********************************************************************************************
* .macros for N=1 and M=1
**********************************************************************************************/

.macro ZERO1x1
	xxlxor	vs0,	vs0,	vs0
	xxlxor	vs1,	vs1,	vs1
.endm

.macro LOAD1x1
	LOAD1x1O 0,0
.endm

.macro LOAD1x1O  OffsetA,OffsetB
	lxv	vs48,	(\OffsetB+0)(BO)	// load real,imag from B
	lxv	vs32,	(0+\OffsetA)(AO)	// load real,imag from A
	xxswapd	vs49, vs48
.endm

.macro END1x1_WITHOUT_ADD
	END1x1 AO,BO,0,0
.endm

.macro END1x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs0,	vs32,	vs48
	xvmaddadp	vs1,	vs32,	vs49
.endm

.macro LOAD1x1_2
	LOAD1x1_2O 0,0
.endm

.macro LOAD1x1_2O  OffsetA,OffsetB
	lxv	vs48,	(\OffsetB+0)(BO)	// load real,imag from B
	lxv	vs52,	(\OffsetB+16)(BO)	// load real,imag from B
	xxswapd	vs49, vs48
	lxv	vs32,	(0+\OffsetA)(AO)	// load real,imag from A
	lxv	vs40,	(16+\OffsetA)(AO)	// load real,imag from A
.endm

.macro END1x1_2
	/* for LOAD1x1_2 the offsets are 32 (A) and 32 (B) */
	KERNEL1x1_2	AO,BO, 32,32, 0,1,1
.endm

.macro KERNEL1x1_E2	OffsetA,OffsetB, Index,IsLast
	KERNEL1x1_2	AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
.endm

.macro KERNEL1x1_L2	OffsetA,OffsetB, Index,IsLast
	KERNEL1x1_2	AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
.endm

.macro KERNEL1x1_2	AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
	xxswapd	vs53, vs52
	xvmaddadp	vs0,	vs32,	vs48
	xvmaddadp	vs1,	vs32,	vs49
.if \Complete==0
	lxv	vs32,	DISP2(\Index, 0+\OffsetA)(\AREG)	// load real,imag from A
	lxv	vs48,	DISP2(\Index, 0+\OffsetB)(\BREG)	// load real,imag from B
	xxswapd	vs49, vs48
.endif
	xvmaddadp	vs0,	vs40,	vs52
	xvmaddadp	vs1,	vs40,	vs53
.if \Complete==0
	lxv	vs40,	DISP2(\Index, 16+\OffsetA)(\AREG)	// load real,imag from A
	lxv	vs52,	DISP2(\Index, 16+\OffsetB)(\BREG)	// load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG, DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG, DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm

.macro KERNEL1x1
	LOAD1x1
	END1x1	AO, BO, 16,16
.endm

.macro SAVE1x1
	SAVE1	vs0,vs1,CO,0
	addi	CO, CO, 16
.endm

/**************************** TRMM POINTER REFRESH MACROS ****************************/

.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
	.if \SHIFT_VAL==16
		slwi	\REG1,	\REG2,	8
	.elseif \SHIFT_VAL==8
		slwi	\REG1,	\REG2,	7
	.elseif \SHIFT_VAL==4
		slwi	\REG1,	\REG2,	6
	.elseif \SHIFT_VAL==2
		slwi	\REG1,	\REG2,	5
	.elseif \SHIFT_VAL==1
		slwi	\REG1,	\REG2,	4
	.endif
.endm
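
/* SHIFT_REG computes REG1 = REG2 * SHIFT_VAL * unit_size with a single
   shift: the shift amount is log2(SHIFT_VAL*16), e.g. for SHIFT_VAL=16
   slwi by 8 gives REG2*256 = REG2*16*16. */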

/*
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	ptrbb = bb;
#else
	ptrba += off*16;
	ptrbb = bb + off*2;
#endif
*/

.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* ptrbb = bb; */
	mr	\PTR_B, \B_VAL			/* refresh BPOINT */
#else
	/*
	ptrba = ptrba + off*C_A;
	ptrbb = bb + off*C_B;
	*/
	SHIFT_REG	T4, \OFF_VAL, \C_B	/* number of values in B, shifted */
	SHIFT_REG	T2, \OFF_VAL, \C_A	/* number of values in A, shifted */
	add	\PTR_B, \B_VAL, T4		/* add values to BO */
	add	\PTR_A, \PTR_A, T2		/* add values to AO */
#endif
.endm

/*
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	temp = bk - off;
#elif defined(LEFT)
	temp = off + 16;	// number of values in A
#else
	temp = off + 2;	// number of values in B
#endif
*/

.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	/* temp = bk - off; */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
	/* temp = off + INCR_A;	// number of values in A */
	addi	\TEMP_BK, \OFF_VAL, \INCR_A
#else
	/* temp = off + INCR_B;	// number of values in B */
	addi	\TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm

/*
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	temp = bk - off;
#ifdef LEFT
	temp -= 16;	// number of values in A
#else
	temp -= 2;	// number of values in B
#endif
	ptrba += temp*16;
	ptrbb += temp*2;
#endif
#ifdef LEFT
	off += 16;	// number of values in A
#endif
*/

.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* temp = bk - off; */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
	/* temp -= C_A;	// number of values in A */
	addi	\TEMP_BK, \TEMP_BK, -\C_A
#else
	/* temp -= C_B;	// number of values in B */
	addi	\TEMP_BK, \TEMP_BK, -\C_B
#endif
	/*
	ptrba += temp*C_A;
	ptrbb += temp*C_B;
	*/
	SHIFT_REG	T4, \TEMP_BK, \C_A
	SHIFT_REG	T2, \TEMP_BK, \C_B
	add	\PTR_A, \PTR_A, T4		/* ptrba += temp*C_A */
	add	\PTR_B, \PTR_B, T2		/* ptrbb += temp*C_B */
#endif
#ifdef LEFT
	/* off += C_A;	// number of values in A */
	addi	\OFF_VAL, \OFF_VAL, \C_A
#endif
.endm
 |