1507 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			1507 lines
		
	
	
		
			28 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| /*********************************************************************/
 | |
| /* Copyright 2009, 2010 The University of Texas at Austin.           */
 | |
| /* All rights reserved.                                              */
 | |
| /*                                                                   */
 | |
| /* Redistribution and use in source and binary forms, with or        */
 | |
| /* without modification, are permitted provided that the following   */
 | |
| /* conditions are met:                                               */
 | |
| /*                                                                   */
 | |
| /*   1. Redistributions of source code must retain the above         */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer.                                                  */
 | |
| /*                                                                   */
 | |
| /*   2. Redistributions in binary form must reproduce the above      */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer in the documentation and/or other materials       */
 | |
| /*      provided with the distribution.                              */
 | |
| /*                                                                   */
 | |
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | |
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | |
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | |
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | |
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | |
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | |
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | |
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | |
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | |
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | |
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | |
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | |
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | |
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | |
| /*                                                                   */
 | |
| /* The views and conclusions contained in the software and           */
 | |
| /* documentation are those of the authors and should not be          */
 | |
| /* interpreted as representing official policies, either expressed   */
 | |
| /* or implied, of The University of Texas at Austin.                 */
 | |
| /*********************************************************************/
 | |
| 
 | |
| #define ASSEMBLER
 | |
| #include "common.h"
 | |
| 
 | |
/* Integer argument-register mapping for the Linux / FreeBSD ELF ABI.
   The GPR assignments differ between 32-bit and 64-bit conventions
   because the floating-point alpha argument travels in f1 and shifts
   the positions of the later integer arguments. */
#if defined(linux) || defined(__FreeBSD__)
#ifndef __64BIT__
#define M	r3	/* problem dimension; compared against 0 and used as loop bound */
#define IS	r4	/* start index; rewritten below as M - IS before use */
#define A	r5	/* base pointer of matrix A */
#define LDA	r6	/* leading dimension of A (converted to bytes via BASE_SHIFT) */
#define X	r7	/* input vector x */
#define	INCX	r8	/* stride of x (converted to bytes via BASE_SHIFT) */
#define	Y	r9	/* output vector y */
#define	INCY	r10	/* stride of y (converted to bytes via BASE_SHIFT) */
#define BUFFER	r14	/* work buffer; actual value loaded from the caller's stack frame */
#else
/* 64-bit: only seven integer args arrive in r3..r10 here; INCY and BUFFER
   are fetched from FRAMESLOT(0)/FRAMESLOT(1) on the stack, so INCY can be
   aliased to the call-clobbered r5. */
#define M	r3	/* problem dimension */
#define IS	r4	/* start index */
#define A	r6	/* base pointer of matrix A */
#define LDA	r7	/* leading dimension of A */
#define X	r8	/* input vector x */
#define	INCX	r9	/* stride of x */
#define	Y	r10	/* output vector y */
#define	INCY	r5	/* stride of y (loaded from stack, not an incoming register) */
#define BUFFER	r14	/* work buffer (loaded from stack) */
#endif
#endif
 | |
| 
 | |
/* Integer argument-register mapping for the AIX / Apple (Darwin) ABIs.
   The 32-bit DOUBLE case differs: a double alpha occupies two argument
   slots, pushing the pointer arguments further along, and Y/INCY/BUFFER
   spill to the caller's stack frame (see the FRAMESLOT loads below). */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3	/* problem dimension */
#define IS	r4	/* start index; rewritten below as M - IS */
#define A	r7	/* base pointer of matrix A */
#define LDA	r8	/* leading dimension of A (byte-scaled via BASE_SHIFT) */
#define X	r9	/* input vector x */
#define	INCX	r10	/* stride of x (byte-scaled via BASE_SHIFT) */
#define	Y	r5	/* output vector y (loaded from FRAMESLOT(0)) */
#define	INCY	r6	/* stride of y (loaded from FRAMESLOT(1)) */
#define BUFFER	r14	/* work buffer (loaded from FRAMESLOT(2)) */
#else
/* 64-bit, or 32-bit single precision: same layout as the Linux 64-bit
   case — INCY and BUFFER come from the stack. */
#define M	r3	/* problem dimension */
#define IS	r4	/* start index */
#define A	r6	/* base pointer of matrix A */
#define LDA	r7	/* leading dimension of A */
#define X	r8	/* input vector x */
#define	INCX	r9	/* stride of x */
#define	Y	r10	/* output vector y */
#define	INCY	r5	/* stride of y (loaded from stack) */
#define BUFFER	r14	/* work buffer (loaded from stack) */
#endif
#endif
 | |
| 
 | |
/* ---- Scratch integer registers ------------------------------------ */
#define I	r11	/* general loop scratch */
#define	J	r12	/* general loop scratch */

/* Column pointers into A for the 4-wide unrolled panel:
   AO1 = A, AO2 = A + LDA, AO3 = AO2 + LDA, AO4 = AO3 + LDA
   (set up at LL(11)); r15-r22/r24 are callee-saved and restored
   from the frame built in the prologue. */
#define AO1	r15
#define AO2	r16
#define AO3	r17
#define AO4	r18
#define XX	r19	/* roving copy of X used inside the inner loops */
#define YY	r20	/* roving copy of NEW_Y used inside the inner loops */
#define	NEW_Y	r21	/* either Y itself or BUFFER when INCY != SIZE (see LL(05)) */
#define TEMP	r22	/* address/arithmetic scratch */
#define	PREA	r24	/* prefetch distance in bytes: PREFETCHSIZE_A * SIZE */

/* ---- Floating-point register aliases ------------------------------ */
/* Four y accumulators streamed through the unrolled loop. */
#define y01 f0
#define y02 f1	/* NOTE: overlaps `alpha` (f1); alpha is saved to the
			   ALPHA stack slot in the prologue before f1 is reused */
#define y03 f2
#define y04 f3

/* alpha * x[IS..IS+3], the per-panel column scalars. */
#define atemp1 f4
#define atemp2 f5
#define atemp3 f6
#define atemp4 f7

/* Streaming x values for the dot-product half of the kernel. */
#define xtemp1 f8
#define xtemp2 f9
#define xtemp3 f10
#define xtemp4 f11

/* Per-column dot-product accumulators, zero-initialized from FZERO. */
#define xsum1  f12
#define xsum2  f13
#define xsum3  f14
#define xsum4  f15

/* Sixteen rotating registers holding a 4x4 tile of A elements
   (a1..a4 from AO1, a5..a8 from AO2, a9..a12 from AO3, a13..a16 from AO4). */
#define a1     f16
#define a2     f17
#define a3     f18
#define a4     f19
#define a5     f20
#define a6     f21
#define a7     f22
#define a8     f23
#define a9     f24
#define a10    f25
#define a11    f26
#define a12    f27
#define a13    f28
#define a14    f29
#define a15    f30
#define a16    f31

/* Incoming scalar alpha per the FP calling convention (first FP arg). */
#define alpha  f1
 | |
| 
 | |
/* Per-core prefetch distance (in elements; multiplied by SIZE into PREA)
   used with dcbt/dcbtst on the A-column and buffer streams. Values are
   empirical per microarchitecture tuning constants. */
#if defined(PPCG4)
#define PREFETCHSIZE_A  24
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  64
#endif

#ifdef CELL
#define PREFETCHSIZE_A  72
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  96
#endif

#ifdef POWER6
#define PREFETCHSIZE_A  40
#endif

/* NOP1/NOP2 are instruction-slot fillers used to pad the software-pipelined
   inner loop. On the big out-of-order cores they expand to nothing; on the
   other cores they are harmless register self-moves (presumably to keep the
   issue pattern of the unrolled loop uniform — NOTE(review): tuning intent,
   confirm against the target pipeline before changing). */
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else
#define NOP1   mr	LDA, LDA
#define NOP2   mr	INCX, INCX
#endif
 | |
| 
 | |
| #ifndef NEEDPARAM
 | |
| 
 | |
/* Stack-frame layout. STACKSIZE covers the callee-saved FPR area
   (f14-f31 at offsets 0..136), the callee-saved GPR area (r14..r27),
   plus two 8-byte scratch slots:
     ALPHA - holds the incoming alpha (f1) so f1 can be reused as y02,
     FZERO - a zeroed doubleword used to initialize the xsum accumulators. */
#ifndef __64BIT__
#define STACKSIZE 224
#define ALPHA     200(SP)
#define	FZERO	  208(SP)
#else
#define STACKSIZE 280
#define ALPHA     256(SP)
#define FZERO	  264(SP)
#endif
 | |
| 
 | |
| 	PROLOGUE
 | |
| 	PROFCODE
 | |
| 
 | |
| 	addi	SP,   SP, -STACKSIZE
 | |
| 	li	r0,   0
 | |
| 
 | |
| 	stfd	f14,     0(SP)
 | |
| 	stfd	f15,     8(SP)
 | |
| 	stfd	f16,    16(SP)
 | |
| 	stfd	f17,    24(SP)
 | |
| 	stfd	f18,    32(SP)
 | |
| 	stfd	f19,    40(SP)
 | |
| 	stfd	f20,    48(SP)
 | |
| 	stfd	f21,    56(SP)
 | |
| 	stfd	f22,    64(SP)
 | |
| 	stfd	f23,    72(SP)
 | |
| 	stfd	f24,    80(SP)
 | |
| 	stfd	f25,    88(SP)
 | |
| 	stfd	f26,    96(SP)
 | |
| 	stfd	f27,   104(SP)
 | |
| 	stfd	f28,   112(SP)
 | |
| 	stfd	f29,   120(SP)
 | |
| 	stfd	f30,   128(SP)
 | |
| 	stfd	f31,   136(SP)
 | |
| 
 | |
| #ifdef __64BIT__
 | |
| 	std	r0,    FZERO
 | |
| 	std	r14,   144(SP)
 | |
| 	std	r15,   152(SP)
 | |
| 	std	r16,   160(SP)
 | |
| 	std	r17,   168(SP)
 | |
| 	std	r18,   176(SP)
 | |
| 	std	r19,   184(SP)
 | |
| 	std	r20,   192(SP)
 | |
| 	std	r21,   200(SP)
 | |
| 	std	r22,   208(SP)
 | |
| 	std	r23,   216(SP)
 | |
| 	std	r24,   224(SP)
 | |
| 	std	r25,   232(SP)
 | |
| 	std	r26,   240(SP)
 | |
| 	std	r27,   248(SP)
 | |
| #else
 | |
| 	stw	r0,    0 + FZERO
 | |
| 	stw	r0,    4 + FZERO
 | |
| 	stw	r14,   144(SP)
 | |
| 	stw	r15,   148(SP)
 | |
| 	stw	r16,   152(SP)
 | |
| 	stw	r17,   156(SP)
 | |
| 	stw	r18,   160(SP)
 | |
| 	stw	r19,   164(SP)
 | |
| 	stw	r20,   168(SP)
 | |
| 	stw	r21,   172(SP)
 | |
| 	stw	r22,   176(SP)
 | |
| 	stw	r23,   180(SP)
 | |
| 	stw	r24,   184(SP)
 | |
| 	stw	r25,   188(SP)
 | |
| 	stw	r26,   192(SP)
 | |
| 	stw	r27,   196(SP)
 | |
| #endif
 | |
| 
 | |
| #if defined(linux) || defined(__FreeBSD__)
 | |
| #ifndef __64BIT__
 | |
| 	lwz	BUFFER,  FRAMESLOT(0) + STACKSIZE(SP)
 | |
| #else
 | |
| 	ld	INCY,    FRAMESLOT(0) + STACKSIZE(SP)
 | |
| 	ld	BUFFER,  FRAMESLOT(1) + STACKSIZE(SP)
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| #if defined(_AIX) || defined(__APPLE__)
 | |
| #ifndef __64BIT__
 | |
| #ifdef DOUBLE
 | |
| 	lwz	Y,       FRAMESLOT(0) + STACKSIZE(SP)
 | |
| 	lwz	INCY,    FRAMESLOT(1) + STACKSIZE(SP)
 | |
| 	lwz	BUFFER,  FRAMESLOT(2) + STACKSIZE(SP)
 | |
| #else
 | |
| 	lwz	  INCY,  FRAMESLOT(0) + STACKSIZE(SP)
 | |
| 	lwz	BUFFER,  FRAMESLOT(1) + STACKSIZE(SP)
 | |
| #endif
 | |
| #else
 | |
| 	ld	INCY,    FRAMESLOT(0) + STACKSIZE(SP)
 | |
| 	ld	BUFFER,  FRAMESLOT(1) + STACKSIZE(SP)
 | |
| #endif
 | |
| #endif
 | |
| 
 | |
| 	STFD	alpha, ALPHA
 | |
| 
 | |
| 	slwi	LDA,  LDA,  BASE_SHIFT
 | |
| 	slwi	INCX, INCX, BASE_SHIFT
 | |
| 	slwi	INCY, INCY, BASE_SHIFT
 | |
| 
 | |
| 	li	PREA, PREFETCHSIZE_A * SIZE
 | |
| 	sub	IS, M, IS
 | |
| 
 | |
| 	cmpwi	cr0, M, 0
 | |
| 	ble-	LL(999)
 | |
| 
 | |
| 	mullw	TEMP, IS, LDA
 | |
| 	add	A, A, TEMP
 | |
| 
 | |
| 	cmpwi	cr0, INCX, SIZE
 | |
| 	beq	LL(05)
 | |
| 
 | |
| 	mr	XX, X
 | |
| 	mr	X, BUFFER
 | |
| 
 | |
| 	srawi.	r0, M, 3
 | |
| 	mtspr	CTR, r0
 | |
| 	ble	LL(03)
 | |
| 	.align 4
 | |
| 
 | |
| LL(01):
 | |
| 	LFD	a1, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a2, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a3, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a4, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a5, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a6, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a7, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 	LFD	a8, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 
 | |
| 	dcbt	XX, PREA
 | |
| 	dcbtst	BUFFER, PREA
 | |
| 
 | |
| 	STFD	a1, 0 * SIZE(BUFFER)
 | |
| 	STFD	a2, 1 * SIZE(BUFFER)
 | |
| 	STFD	a3, 2 * SIZE(BUFFER)
 | |
| 	STFD	a4, 3 * SIZE(BUFFER)
 | |
| 	STFD	a5, 4 * SIZE(BUFFER)
 | |
| 	STFD	a6, 5 * SIZE(BUFFER)
 | |
| 	STFD	a7, 6 * SIZE(BUFFER)
 | |
| 	STFD	a8, 7 * SIZE(BUFFER)
 | |
| 
 | |
| 	addi	BUFFER, BUFFER, 8 * SIZE
 | |
| 	bdnz	LL(01)
 | |
| 	.align 4
 | |
| 
 | |
| LL(03):
 | |
| 	andi.	r0, M, 7
 | |
| 	mtspr	CTR, r0
 | |
| 	ble	LL(05)
 | |
| 	.align 4
 | |
| 
 | |
| LL(04):
 | |
| 	LFD	a1, 0 * SIZE(XX)
 | |
| 	add	XX, XX, INCX
 | |
| 
 | |
| 	STFD	a1, 0 * SIZE(BUFFER)
 | |
| 	addi	BUFFER, BUFFER, 1 * SIZE
 | |
| 	bdnz	LL(04)
 | |
| 	.align 4
 | |
| 
 | |
| LL(05):
 | |
| 	mr	NEW_Y, Y
 | |
| 	lfd	f0, FZERO
 | |
| 
 | |
| 	cmpwi	cr0, INCY, SIZE
 | |
| 	beq	LL(10)
 | |
| 
 | |
| 	mr	NEW_Y, BUFFER
 | |
| 
 | |
| 	addi	r0, M,  7
 | |
| 	srawi.	r0, r0, 3
 | |
| 	mtspr	CTR, r0
 | |
| 	.align 4
 | |
| 
 | |
| LL(06):
 | |
| 	STFD	f0, 0 * SIZE(BUFFER)
 | |
| 	STFD	f0, 1 * SIZE(BUFFER)
 | |
| 	STFD	f0, 2 * SIZE(BUFFER)
 | |
| 	STFD	f0, 3 * SIZE(BUFFER)
 | |
| 	STFD	f0, 4 * SIZE(BUFFER)
 | |
| 	STFD	f0, 5 * SIZE(BUFFER)
 | |
| 	STFD	f0, 6 * SIZE(BUFFER)
 | |
| 	STFD	f0, 7 * SIZE(BUFFER)
 | |
| 	addi	BUFFER, BUFFER, 8 * SIZE
 | |
| 	bdnz	LL(06)
 | |
| 	.align 4
 | |
| 
 | |
| LL(10):
 | |
| 	addi	TEMP, IS, 4
 | |
| 	cmpw	cr0, TEMP, M
 | |
| 	bgt	LL(20)
 | |
| 	.align 4
 | |
| 
 | |
| LL(11):
 | |
| 	mr	AO1, A
 | |
| 	add	AO2, A,   LDA
 | |
| 	add	AO3, AO2, LDA
 | |
| 	add	AO4, AO3, LDA
 | |
| 	add	A,   AO4, LDA
 | |
| 
 | |
| 	slwi	TEMP,  IS,  BASE_SHIFT
 | |
| 	add	TEMP, X, TEMP
 | |
| 
 | |
| 	LFD	a16, ALPHA
 | |
| 	lfd	xsum1, FZERO
 | |
| 
 | |
| 	LFD	atemp1, 0 * SIZE(TEMP)
 | |
| 	LFD	atemp2, 1 * SIZE(TEMP)
 | |
| 	LFD	atemp3, 2 * SIZE(TEMP)
 | |
| 	LFD	atemp4, 3 * SIZE(TEMP)
 | |
| 
 | |
| 	LFD	xtemp1,  0 * SIZE(X)
 | |
| 	LFD	xtemp2,  1 * SIZE(X)
 | |
| 	LFD	xtemp3,  2 * SIZE(X)
 | |
| 	LFD	xtemp4,  3 * SIZE(X)
 | |
| 
 | |
| 	LFD	y01,  0 * SIZE(NEW_Y)
 | |
| 	LFD	y02,  1 * SIZE(NEW_Y)
 | |
| 	LFD	y03,  2 * SIZE(NEW_Y)
 | |
| 	LFD	y04,  3 * SIZE(NEW_Y)
 | |
| 
 | |
| 	LFD	a1,  0 * SIZE(AO1)
 | |
| 	FMUL	atemp1, a16, atemp1
 | |
| 	LFD	a2,  1 * SIZE(AO1)
 | |
| 	FMUL	atemp2, a16, atemp2
 | |
| 	LFD	a3,  2 * SIZE(AO1)
 | |
| 	FMUL	atemp3, a16, atemp3
 | |
| 	LFD	a4,  3 * SIZE(AO1)
 | |
| 	FMUL	atemp4, a16, atemp4
 | |
| 
 | |
| 	LFD	a5,  0 * SIZE(AO2)
 | |
| 	fmr	xsum2, xsum1
 | |
| 	LFD	a6,  1 * SIZE(AO2)
 | |
| 	fmr	xsum3, xsum1
 | |
| 	LFD	a7,  2 * SIZE(AO2)
 | |
| 	fmr	xsum4, xsum1
 | |
| 	LFD	a8,  3 * SIZE(AO2)
 | |
| 
 | |
| 	LFD	a9,  0 * SIZE(AO3)
 | |
| 	LFD	a10, 1 * SIZE(AO3)
 | |
| 	LFD	a11, 2 * SIZE(AO3)
 | |
| 	LFD	a12, 3 * SIZE(AO3)
 | |
| 
 | |
| 	LFD	a13, 0 * SIZE(AO4)
 | |
| 	LFD	a14, 1 * SIZE(AO4)
 | |
| 	LFD	a15, 2 * SIZE(AO4)
 | |
| 	LFD	a16, 3 * SIZE(AO4)
 | |
| 
 | |
| 	mr	XX, X
 | |
| 	mr	YY, NEW_Y
 | |
| 
 | |
| 	srawi.	r0,  IS, 4
 | |
| 	mtspr	CTR, r0
 | |
| 	ble	LL(14)
 | |
| 	.align 4
 | |
| 
 | |
| LL(12):
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	DCBT(AO1, PREA)
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1,  4 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1,  4 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2,  5 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5,  4 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6,  5 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2,  5 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| #	DCBT(X, PREX)
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3,  6 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  4 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7,  6 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10, 5 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 6 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3,  6 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4,  7 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13, 4 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8,  7 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 5 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 7 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 6 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4,  7 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 7 * SIZE(AO4)
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	LFD	y01,  4 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 	LFD	y02,  5 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03,  2 * SIZE(YY)
 | |
| 	LFD	y03,  6 * SIZE(YY)
 | |
| 	STFD	y04,  3 * SIZE(YY)
 | |
| 	LFD	y04,  7 * SIZE(YY)
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	DCBT(AO2, PREA)
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1,  8 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1,  8 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2,  9 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5,  8 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6,  9 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2,  9 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3, 10 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  8 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7, 10 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10, 9 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 10 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3, 10 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4, 11 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13, 8 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8, 11 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 9 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 11 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 10 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4, 11 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 11 * SIZE(AO4)
 | |
| 
 | |
| 	STFD	y01,  4 * SIZE(YY)
 | |
| 	LFD	y01,  8 * SIZE(YY)
 | |
| 	STFD	y02,  5 * SIZE(YY)
 | |
| 	LFD	y02,  9 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03,  6 * SIZE(YY)
 | |
| 	LFD	y03, 10 * SIZE(YY)
 | |
| 	STFD	y04,  7 * SIZE(YY)
 | |
| 	LFD	y04, 11 * SIZE(YY)
 | |
| 
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	DCBT(AO3, PREA)
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1, 12 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1, 12 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2, 13 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5, 12 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6, 13 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| #	DCBT(Y1, PREY)
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2, 13 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3, 14 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9, 12 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7, 14 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10,13 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 14 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3, 14 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4, 15 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13,12 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8, 15 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 13 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 15 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 14 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4,  15 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 15 * SIZE(AO4)
 | |
| 
 | |
| 	STFD	y01,  8 * SIZE(YY)
 | |
| 	LFD	y01, 12 * SIZE(YY)
 | |
| 	STFD	y02,  9 * SIZE(YY)
 | |
| 	LFD	y02, 13 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03, 10 * SIZE(YY)
 | |
| 	LFD	y03, 14 * SIZE(YY)
 | |
| 	STFD	y04, 11 * SIZE(YY)
 | |
| 	LFD	y04, 15 * SIZE(YY)
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	DCBT(AO4, PREA)
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1, 16 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1, 16 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	addi	YY, YY, 16 * SIZE
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2, 17 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5, 16 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	addi	AO3, AO3, 16 * SIZE
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6, 17 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	addi	AO1, AO1, 16 * SIZE
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	addi	AO2, AO2, 16 * SIZE
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2, 17 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	addi	AO4, AO4, 16 * SIZE
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3,  2 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  0 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7,  2 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10,  1 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11,  2 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3, 18 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	addi	XX, XX, 16 * SIZE
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4,  3 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13,  0 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8,  3 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14,  1 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12,  3 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15,  2 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4,  3 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16,  3 * SIZE(AO4)
 | |
| 
 | |
| 	STFD	y01, -4 * SIZE(YY)
 | |
| 	LFD	y01,  0 * SIZE(YY)
 | |
| 	STFD	y02, -3 * SIZE(YY)
 | |
| 	LFD	y02,  1 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03, -2 * SIZE(YY)
 | |
| 	LFD	y03,  2 * SIZE(YY)
 | |
| 	STFD	y04, -1 * SIZE(YY)
 | |
| 	LFD	y04,  3 * SIZE(YY)
 | |
| 	bdnz	LL(12)
 | |
| 	.align 4
 | |
| 
 | |
| LL(14):
 | |
| 	andi.	r0,  IS, 8
 | |
| 	ble	LL(15)
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	NOP1
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1,  4 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1,  4 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2,  5 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5,  4 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6,  5 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2,  5 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3,  6 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  4 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7,  6 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10, 5 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 6 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3,  6 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4,  7 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13, 4 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8,  7 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 5 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 7 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 6 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4,  7 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 7 * SIZE(AO4)
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	LFD	y01,  4 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 	LFD	y02,  5 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03,  2 * SIZE(YY)
 | |
| 	LFD	y03,  6 * SIZE(YY)
 | |
| 	STFD	y04,  3 * SIZE(YY)
 | |
| 	LFD	y04,  7 * SIZE(YY)
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	NOP1
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1,  8 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1,  8 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2,  9 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5,  8 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6,  9 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2,  9 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3, 10 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  8 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7, 10 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10, 9 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 10 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3, 10 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4, 11 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13, 8 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8, 11 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 9 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 11 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 10 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4, 11 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 11 * SIZE(AO4)
 | |
| 
 | |
| 	addi	AO1, AO1, 8 * SIZE
 | |
| 	addi	AO2, AO2, 8 * SIZE
 | |
| 	addi	AO3, AO3, 8 * SIZE
 | |
| 	addi	AO4, AO4, 8 * SIZE
 | |
| 
 | |
| 	STFD	y01,  4 * SIZE(YY)
 | |
| 	LFD	y01,  8 * SIZE(YY)
 | |
| 	STFD	y02,  5 * SIZE(YY)
 | |
| 	LFD	y02,  9 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03,  6 * SIZE(YY)
 | |
| 	LFD	y03, 10 * SIZE(YY)
 | |
| 	STFD	y04,  7 * SIZE(YY)
 | |
| 	LFD	y04, 11 * SIZE(YY)
 | |
| 
 | |
| 	addi	XX, XX, 8 * SIZE
 | |
| 	addi	YY, YY, 8 * SIZE
 | |
| 	.align 4
 | |
| 
 | |
| LL(15):
 | |
| 	andi.	r0,  IS, 4
 | |
| 	ble	LL(18)
 | |
| 
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	NOP1
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	LFD	a1,  4 * SIZE(AO1)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum3, xtemp1, a9,  xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp1, a3,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp1, a13, xsum4
 | |
| 	LFD	xtemp1,  4 * SIZE(XX)
 | |
| 	FMADD	y04, atemp1, a4,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	LFD	a2,  5 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	LFD	a5,  4 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 	NOP1
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 	LFD	a6,  5 * SIZE(AO2)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp2, a10, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp2, a7,  y03
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum4, xtemp2, a14, xsum4
 | |
| 	LFD	xtemp2,  5 * SIZE(XX)
 | |
| 	FMADD	y04, atemp2, a8,  y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp3, a3,  xsum1
 | |
| 	LFD	a3,  6 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp3, a9,  y01
 | |
| 	LFD	a9,  4 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp3, a7,  xsum2
 | |
| 	LFD	a7,  6 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp3, a10, y02
 | |
| 	LFD	a10, 5 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp3, a11, xsum3
 | |
| 	NOP1
 | |
| 	FMADD	y03, atemp3, a11, y03
 | |
| 	LFD	a11, 6 * SIZE(AO3)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp3, a15, xsum4
 | |
| 	LFD	xtemp3,  6 * SIZE(XX)
 | |
| 	FMADD	y04, atemp3, a12, y04
 | |
| 	NOP2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp4, a4,  xsum1
 | |
| 	LFD	a4,  7 * SIZE(AO1)
 | |
| 	FMADD	y01, atemp4, a13, y01
 | |
| 	LFD	a13, 4 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum2, xtemp4, a8,  xsum2
 | |
| 	LFD	a8,  7 * SIZE(AO2)
 | |
| 	FMADD	y02, atemp4, a14, y02
 | |
| 	LFD	a14, 5 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum3, xtemp4, a12, xsum3
 | |
| 	LFD	a12, 7 * SIZE(AO3)
 | |
| 	FMADD	y03, atemp4, a15, y03
 | |
| 	LFD	a15, 6 * SIZE(AO4)
 | |
| 
 | |
| 	FMADD	xsum4, xtemp4, a16, xsum4
 | |
| 	LFD	xtemp4,  7 * SIZE(XX)
 | |
| 	FMADD	y04, atemp4, a16, y04
 | |
| 	LFD	a16, 7 * SIZE(AO4)
 | |
| 
 | |
| 	addi	AO1, AO1, 4 * SIZE
 | |
| 	addi	AO2, AO2, 4 * SIZE
 | |
| 	addi	AO3, AO3, 4 * SIZE
 | |
| 	addi	AO4, AO4, 4 * SIZE
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	LFD	y01,  4 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 	LFD	y02,  5 * SIZE(YY)
 | |
| 
 | |
| 	STFD	y03,  2 * SIZE(YY)
 | |
| 	LFD	y03,  6 * SIZE(YY)
 | |
| 	STFD	y04,  3 * SIZE(YY)
 | |
| 	LFD	y04,  7 * SIZE(YY)
 | |
| 
 | |
| 	addi	XX, XX, 4 * SIZE
 | |
| 	addi	YY, YY, 4 * SIZE
 | |
| 	.align 4
 | |
| 
 | |
| LL(18):
 | |
| 	LFD	xtemp1, ALPHA
 | |
| 
 | |
| 	FMUL	xsum1, xtemp1, xsum1
 | |
| 	FMUL	xsum2, xtemp1, xsum2
 | |
| 	FMUL	xsum3, xtemp1, xsum3
 | |
| 	FMUL	xsum4, xtemp1, xsum4
 | |
| 
 | |
| 	FMADD	xsum1, atemp1, a1,  xsum1
 | |
| 	FMADD	xsum2, atemp1, a5,  xsum2
 | |
| 	FMADD	xsum3, atemp1, a9,  xsum3
 | |
| 	FMADD	xsum4, atemp1, a13, xsum4
 | |
| 
 | |
| 	FMADD	xsum1, atemp2, a5,  xsum1
 | |
| 	FMADD	xsum2, atemp2, a6,  xsum2
 | |
| 	FMADD	xsum3, atemp2, a10, xsum3
 | |
| 	FMADD	xsum4, atemp2, a14, xsum4
 | |
| 
 | |
| 	FMADD	xsum1, atemp3, a9,  xsum1
 | |
| 	FMADD	xsum2, atemp3, a10, xsum2
 | |
| 	FMADD	xsum3, atemp3, a11, xsum3
 | |
| 	FMADD	xsum4, atemp3, a15, xsum4
 | |
| 
 | |
| 	FMADD	xsum1, atemp4, a13, xsum1
 | |
| 	FMADD	xsum2, atemp4, a14, xsum2
 | |
| 	FMADD	xsum3, atemp4, a15, xsum3
 | |
| 	FMADD	xsum4, atemp4, a16, xsum4
 | |
| 
 | |
| 	FADD	y01, y01, xsum1
 | |
| 	FADD	y02, y02, xsum2
 | |
| 	FADD	y03, y03, xsum3
 | |
| 	FADD	y04, y04, xsum4
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 	STFD	y03,  2 * SIZE(YY)
 | |
| 	STFD	y04,  3 * SIZE(YY)
 | |
| 
 | |
| 	addi	TEMP, IS, 8
 | |
| 	addi	IS,   IS, 4
 | |
| 	cmpw	cr0, TEMP, M
 | |
| 	ble	LL(11)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(20): remainder path — process a 2-column panel when (M & 2) != 0.
   Same structure as the 4-wide panel: a fused pass that accumulates the
   dot products (xsum1/2) and applies the axpy update to y at once. */
| LL(20):
 | |
| 	andi.	TEMP, M, 2
 | |
| 	ble	LL(30)
 | |
| 
 | |
/* AO1/AO2 point at the two matrix columns; A advances past them. */
| 	mr	AO1, A
 | |
| 	add	AO2, A,   LDA
 | |
| 	add	A,   AO2, LDA
 | |
| 
 | |
/* atemp1/2 = ALPHA * x[IS], ALPHA * x[IS+1] (element size 1 << BASE_SHIFT). */
| 	slwi	TEMP,  IS,  BASE_SHIFT
 | |
| 	add	TEMP, X, TEMP
 | |
| 
 | |
| 	LFD	atemp1, 0 * SIZE(TEMP)
 | |
| 	LFD	atemp2, 1 * SIZE(TEMP)
 | |
| 
 | |
| 	LFD	a1, ALPHA
 | |
| 
 | |
| 	FMUL	atemp1, a1, atemp1
 | |
| 	FMUL	atemp2, a1, atemp2
 | |
| 
 | |
/* Zero the two accumulators from the FZERO stack slot. */
| 	lfd	xsum1, FZERO
 | |
| 	fmr	xsum2, xsum1
 | |
| 
 | |
| 	mr	XX, X
 | |
| 	mr	YY, NEW_Y
 | |
| 
 | |
/* Pre-load the first pair of x, y and column values (software pipelining:
   the loop body consumes these and loads the next pair). */
| 	LFD	xtemp1,  0 * SIZE(XX)
 | |
| 	LFD	xtemp2,  1 * SIZE(XX)
 | |
| 
 | |
| 	LFD	y01,  0 * SIZE(YY)
 | |
| 	LFD	y02,  1 * SIZE(YY)
 | |
| 
 | |
| 	LFD	a1,  0 * SIZE(AO1)
 | |
| 	LFD	a2,  1 * SIZE(AO1)
 | |
| 
 | |
| 	LFD	a5,  0 * SIZE(AO2)
 | |
| 	LFD	a6,  1 * SIZE(AO2)
 | |
| 
 | |
/* CTR = IS / 2: walk the IS rows that lie above the 2x2 diagonal block. */
| 	srawi.	r0,  IS, 1
 | |
| 	mtspr	CTR, r0
 | |
| 	ble	LL(28)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(22): main 2-rows-per-iteration loop.
   xsum += x * A-column (transposed part), y += atemp * A-row (direct part). */
| LL(22):
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 	FMADD	xsum2, xtemp1, a5,  xsum2
 | |
| 
 | |
| 	FMADD	xsum1, xtemp2, a2,  xsum1
 | |
| 	FMADD	xsum2, xtemp2, a6,  xsum2
 | |
| 
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 	FMADD	y02, atemp1, a2,  y02
 | |
| 	FMADD	y01, atemp2, a5,  y01
 | |
| 	FMADD	y02, atemp2, a6,  y02
 | |
| 
 | |
/* Load next iteration's operands before storing this iteration's result. */
| 	LFD	xtemp1,  2 * SIZE(XX)
 | |
| 	LFD	xtemp2,  3 * SIZE(XX)
 | |
| 
 | |
| 	LFD	a1,  2 * SIZE(AO1)
 | |
| 	LFD	a2,  3 * SIZE(AO1)
 | |
| 
 | |
| 	LFD	a5,  2 * SIZE(AO2)
 | |
| 	LFD	a6,  3 * SIZE(AO2)
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 
 | |
| 	LFD	y01,  2 * SIZE(YY)
 | |
| 	LFD	y02,  3 * SIZE(YY)
 | |
| 
 | |
| 	addi	AO1, AO1, 2 * SIZE
 | |
| 	addi	AO2, AO2, 2 * SIZE
 | |
| 
 | |
| 	addi	XX, XX, 2 * SIZE
 | |
| 	addi	YY, YY, 2 * SIZE
 | |
| 
 | |
| 	bdnz	LL(22)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(28): reduction — ALPHA-scale the accumulators, add the symmetric 2x2
   diagonal block (a5 used for both off-diagonal terms), store into y. */
| LL(28):
 | |
| 	LFD	xtemp1, ALPHA
 | |
| 
 | |
| 	FMUL	xsum1, xtemp1, xsum1
 | |
| 	FMUL	xsum2, xtemp1, xsum2
 | |
| 
 | |
| 	FMADD	xsum1, atemp1, a1,  xsum1
 | |
| 	FMADD	xsum2, atemp1, a5,  xsum2
 | |
| 	FMADD	xsum1, atemp2, a5,  xsum1
 | |
| 	FMADD	xsum2, atemp2, a6,  xsum2
 | |
| 
 | |
| 	FADD	y01, y01, xsum1
 | |
| 	FADD	y02, y02, xsum2
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	STFD	y02,  1 * SIZE(YY)
 | |
| 
 | |
| 	addi	IS, IS, 2
 | |
| 	.align 4
 | |
| 
 | |
/* LL(30): remainder path — process the final single column when (M & 1) != 0.
   Scalar version of the panel code: one accumulator, one column pointer. */
| LL(30):
 | |
| 	andi.	TEMP, M, 1
 | |
| 	ble	LL(990)
 | |
| 
 | |
| 	mr	AO1, A
 | |
| 
 | |
/* atemp1 = ALPHA * x[IS]. */
| 	slwi	TEMP,  IS,  BASE_SHIFT
 | |
| 	add	TEMP, X, TEMP
 | |
| 
 | |
| 	LFD	atemp1, 0 * SIZE(TEMP)
 | |
| 
 | |
| 	LFD	a1, ALPHA
 | |
| 
 | |
| 	FMUL	atemp1, a1, atemp1
 | |
| 
 | |
| 	lfd	xsum1, FZERO
 | |
| 
 | |
| 	mr	XX, X
 | |
| 	mr	YY, NEW_Y
 | |
| 
 | |
/* Pre-load first x, y and column element (consumed then refilled in LL(32)). */
| 	LFD	xtemp1,  0 * SIZE(XX)
 | |
| 	LFD	y01,  0 * SIZE(YY)
 | |
| 
 | |
| 	LFD	a1,  0 * SIZE(AO1)
 | |
| 
 | |
/* CTR = IS: one row per iteration over the rows above the diagonal element. */
| 	mtspr	CTR, IS
 | |
| 	cmpwi	cr0, IS, 0
 | |
| 	ble	LL(38)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(32): xsum1 += x[i]*a[i];  y[i] += atemp1*a[i]  (one element per pass). */
| LL(32):
 | |
| 	FMADD	xsum1, xtemp1, a1,  xsum1
 | |
| 
 | |
| 	FMADD	y01, atemp1, a1,  y01
 | |
| 
 | |
| 	LFD	xtemp1,  1 * SIZE(XX)
 | |
| 
 | |
| 	LFD	a1,   1 * SIZE(AO1)
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 
 | |
| 	LFD	y01,  1 * SIZE(YY)
 | |
| 
 | |
| 	addi	AO1, AO1, 1 * SIZE
 | |
| 
 | |
| 	addi	XX, XX, 1 * SIZE
 | |
| 	addi	YY, YY, 1 * SIZE
 | |
| 
 | |
| 	bdnz	LL(32)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(38): reduction — y[IS] += ALPHA*xsum1 + atemp1*diagonal element. */
| LL(38):
 | |
| 	LFD	xtemp1, ALPHA
 | |
| 
 | |
| 	FMUL	xsum1, xtemp1, xsum1
 | |
| 
 | |
| 	FMADD	xsum1, atemp1, a1,  xsum1
 | |
| 
 | |
| 	FADD	y01, y01, xsum1
 | |
| 
 | |
| 	STFD	y01,  0 * SIZE(YY)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(990): write-back phase.  If Y is contiguous (INCY == SIZE) the result
   is already in place and we jump straight to the epilogue; otherwise add
   the contiguous NEW_Y work buffer into the user's strided Y vector. */
| LL(990):
 | |
| 	cmpwi	cr0, INCY, SIZE
 | |
| 	beq	LL(999)
 | |
| 
 | |
| 	mr	YY, Y
 | |
| 
 | |
/* CTR = M / 8: unrolled copy-back, eight elements per iteration. */
| 	srawi.	r0, M, 3
 | |
| 	mtspr	CTR, r0
 | |
| 	ble	LL(995)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(991): f0..f7 = eight strided Y elements; f8..f15 = eight contiguous
   NEW_Y elements; store the sums back through the strided pointer YY. */
| LL(991):
 | |
| 	LFD	f0,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f1,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f2,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f3,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f4,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f5,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f6,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f7,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 
 | |
| 	LFD	f8,   0 * SIZE(NEW_Y)
 | |
| 	LFD	f9,   1 * SIZE(NEW_Y)
 | |
| 	LFD	f10,  2 * SIZE(NEW_Y)
 | |
| 	LFD	f11,  3 * SIZE(NEW_Y)
 | |
| 	LFD	f12,  4 * SIZE(NEW_Y)
 | |
| 	LFD	f13,  5 * SIZE(NEW_Y)
 | |
| 	LFD	f14,  6 * SIZE(NEW_Y)
 | |
| 	LFD	f15,  7 * SIZE(NEW_Y)
 | |
| 	addi	NEW_Y, NEW_Y, 8 * SIZE
 | |
| 
 | |
| 	FADD	f8,  f8,  f0
 | |
| 	FADD	f9,  f9,  f1
 | |
| 	FADD	f10, f10, f2
 | |
| 	FADD	f11, f11, f3
 | |
| 	FADD	f12, f12, f4
 | |
| 	FADD	f13, f13, f5
 | |
| 	FADD	f14, f14, f6
 | |
| 	FADD	f15, f15, f7
 | |
| 
 | |
| 	STFD	f8,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f9,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f10, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f11, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f12, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f13, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f14, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f15, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	bdnz	LL(991)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(995)/LL(996)/LL(997): copy-back remainders for the strided-Y path —
   handle 4, then 2, then 1 leftover elements of M (same add-and-store
   pattern as LL(991), progressively narrower). */
| LL(995):
 | |
| 	andi.	J, M, 4
 | |
| 	ble	LL(996)
 | |
| 
 | |
| 	LFD	f0,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f1,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f2,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f3,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 
 | |
| 	LFD	f8,   0 * SIZE(NEW_Y)
 | |
| 	LFD	f9,   1 * SIZE(NEW_Y)
 | |
| 	LFD	f10,  2 * SIZE(NEW_Y)
 | |
| 	LFD	f11,  3 * SIZE(NEW_Y)
 | |
| 	addi	NEW_Y, NEW_Y, 4 * SIZE
 | |
| 
 | |
| 	FADD	f8,  f8,  f0
 | |
| 	FADD	f9,  f9,  f1
 | |
| 	FADD	f10, f10, f2
 | |
| 	FADD	f11, f11, f3
 | |
| 
 | |
| 	STFD	f8,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f9,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f10, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f11, 0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	.align 4
 | |
| 
 | |
/* Two leftover elements. */
| LL(996):
 | |
| 	andi.	J, M, 2
 | |
| 	ble	LL(997)
 | |
| 
 | |
| 	LFD	f0,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 	LFD	f1,  0 * SIZE(Y)
 | |
| 	add	Y, Y, INCY
 | |
| 
 | |
| 	LFD	f8,   0 * SIZE(NEW_Y)
 | |
| 	LFD	f9,   1 * SIZE(NEW_Y)
 | |
| 	addi	NEW_Y, NEW_Y, 2 * SIZE
 | |
| 
 | |
| 	FADD	f8,  f8,  f0
 | |
| 	FADD	f9,  f9,  f1
 | |
| 
 | |
| 	STFD	f8,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	STFD	f9,  0 * SIZE(YY)
 | |
| 	add	YY, YY, INCY
 | |
| 	.align 4
 | |
| 
 | |
/* Final odd element. */
| LL(997):
 | |
| 	andi.	J, M, 1
 | |
| 	ble	LL(999)
 | |
| 
 | |
| 	LFD	f0,  0 * SIZE(Y)
 | |
| 	LFD	f8,   0 * SIZE(NEW_Y)
 | |
| 
 | |
| 	FADD	f8,  f8,  f0
 | |
| 
 | |
| 	STFD	f8,  0 * SIZE(YY)
 | |
| 	.align 4
 | |
| 
 | |
/* LL(999): function epilogue.  Return value r3 = 0, restore the non-volatile
   FPRs f14-f31 and GPRs r14-r27 saved by the prologue (above this chunk),
   release the stack frame and return.  Offsets differ between the 64-bit
   (8-byte ld) and 32-bit (4-byte lwz) ABIs. */
| LL(999):
 | |
| 	li	r3, 0
 | |
| 
 | |
| 	lfd	f14,     0(SP)
 | |
| 	lfd	f15,     8(SP)
 | |
| 	lfd	f16,    16(SP)
 | |
| 	lfd	f17,    24(SP)
 | |
| 	lfd	f18,    32(SP)
 | |
| 	lfd	f19,    40(SP)
 | |
| 	lfd	f20,    48(SP)
 | |
| 	lfd	f21,    56(SP)
 | |
| 	lfd	f22,    64(SP)
 | |
| 	lfd	f23,    72(SP)
 | |
| 	lfd	f24,    80(SP)
 | |
| 	lfd	f25,    88(SP)
 | |
| 	lfd	f26,    96(SP)
 | |
| 	lfd	f27,   104(SP)
 | |
| 	lfd	f28,   112(SP)
 | |
| 	lfd	f29,   120(SP)
 | |
| 	lfd	f30,   128(SP)
 | |
| 	lfd	f31,   136(SP)
 | |
| 
 | |
/* Integer callee-saved registers start right after the FPR save area (144). */
| #ifdef __64BIT__
 | |
| 	ld	r14,   144(SP)
 | |
| 	ld	r15,   152(SP)
 | |
| 	ld	r16,   160(SP)
 | |
| 	ld	r17,   168(SP)
 | |
| 	ld	r18,   176(SP)
 | |
| 	ld	r19,   184(SP)
 | |
| 	ld	r20,   192(SP)
 | |
| 	ld	r21,   200(SP)
 | |
| 	ld	r22,   208(SP)
 | |
| 	ld	r23,   216(SP)
 | |
| 	ld	r24,   224(SP)
 | |
| 	ld	r25,   232(SP)
 | |
| 	ld	r26,   240(SP)
 | |
| 	ld	r27,   248(SP)
 | |
| #else
 | |
| 	lwz	r14,   144(SP)
 | |
| 	lwz	r15,   148(SP)
 | |
| 	lwz	r16,   152(SP)
 | |
| 	lwz	r17,   156(SP)
 | |
| 	lwz	r18,   160(SP)
 | |
| 	lwz	r19,   164(SP)
 | |
| 	lwz	r20,   168(SP)
 | |
| 	lwz	r21,   172(SP)
 | |
| 	lwz	r22,   176(SP)
 | |
| 	lwz	r23,   180(SP)
 | |
| 	lwz	r24,   184(SP)
 | |
| 	lwz	r25,   188(SP)
 | |
| 	lwz	r26,   192(SP)
 | |
| 	lwz	r27,   196(SP)
 | |
| #endif
 | |
| 
 | |
/* Pop the frame allocated in the prologue and return to the caller. */
| 	addi	SP, SP, STACKSIZE
 | |
| 	blr
 | |
| 
 | |
| 	EPILOGUE
 | |
/* Closes a preprocessor conditional opened before this chunk. */
| #endif
 |