4255 lines
		
	
	
		
			64 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			4255 lines
		
	
	
		
			64 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| /*********************************************************************/
 | |
| /* Copyright 2009, 2010 The University of Texas at Austin.           */
 | |
| /* All rights reserved.                                              */
 | |
| /*                                                                   */
 | |
| /* Redistribution and use in source and binary forms, with or        */
 | |
| /* without modification, are permitted provided that the following   */
 | |
| /* conditions are met:                                               */
 | |
| /*                                                                   */
 | |
| /*   1. Redistributions of source code must retain the above         */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer.                                                  */
 | |
| /*                                                                   */
 | |
| /*   2. Redistributions in binary form must reproduce the above      */
 | |
| /*      copyright notice, this list of conditions and the following  */
 | |
| /*      disclaimer in the documentation and/or other materials       */
 | |
| /*      provided with the distribution.                              */
 | |
| /*                                                                   */
 | |
| /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | |
| /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | |
| /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | |
| /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | |
| /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | |
| /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | |
| /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | |
| /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | |
| /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | |
| /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | |
| /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | |
| /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | |
| /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | |
| /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | |
| /*                                                                   */
 | |
| /* The views and conclusions contained in the software and           */
 | |
| /* documentation are those of the authors and should not be          */
 | |
| /* interpreted as representing official policies, either expressed   */
 | |
| /* or implied, of The University of Texas at Austin.                 */
 | |
| /*********************************************************************/
 | |
| 
 | |
| #define ASSEMBLER
 | |
| #include "common.h"
 | |
| 
 | |
| #define M	%i0	/* consumed as M & 1, M & 2 and M >> 2 to select the tile paths */
 | |
| #define N	%i1	/* consumed four at a time: J = N >> 2 */
 | |
| #define K	%i2	/* accumulation depth; combined with KK to get loop trip counts */
 | |
| 
 | |
| #if defined(DOUBLE) && !defined(__64BIT__)	/* 32-bit DOUBLE build: A and B use swapped registers; the prologue loads B from the stack */
 | |
| #define A	%i5
 | |
| #define B	%i4
 | |
| #else	/* all other builds: A in %i4, B in %i5 */
 | |
| #define A	%i4
 | |
| #define B	%i5
 | |
| #endif
 | |
| 
 | |
| #define C	%o4	/* C pointer; loaded from the stack in the prologue */
 | |
| #define LDC	%o5	/* stride between C1..C4; shifted left by BASE_SHIFT in the prologue */
 | |
| 
 | |
| #define AO	%l0	/* working pointer for loads on the A side */
 | |
| #define BO	%l1	/* working pointer for loads on the B side */
 | |
| #define I	%l2	/* M-derived tile selector/counter (M & 1, M & 2, M >> 2) */
 | |
| #define J	%l3	/* set to N >> 2 before the .LL11 section */
 | |
| #define L	%l4	/* inner-loop trip counter, decremented until it reaches zero */
 | |
| 
 | |
| #define C1	%o0	/* C1..C4 = C, C + LDC, C + 2*LDC, C + 3*LDC */
 | |
| #define C2	%o1
 | |
| #define C3	%o2
 | |
| #define C4	%o3
 | |
| 
 | |
| #define OFFSET	%l5	/* loaded from the stack in the prologue; used to initialise KK */
 | |
| #define	KK	%l6	/* running offset into K; KK or K - KK sets the accumulation length */
 | |
| #define TEMP1	%l7	/* scratch for address and count arithmetic */
 | |
| #define TEMP2	%i3	/* second scratch register */
 | |
| #define AORIG	%g1	/* base A pointer kept for the LN/RT paths, which derive AO from it */
 | |
| 
 | |
| #ifdef DOUBLE	/* double precision: every alias below is an even-numbered %f register */
 | |
| #define c01	%f0	/* c01..c16: accumulators, zeroed from FZERO before each tile */
 | |
| #define c02	%f2
 | |
| #define c03	%f4
 | |
| #define c04	%f6
 | |
| #define c05	%f8
 | |
| #define c06	%f10
 | |
| #define c07	%f12
 | |
| #define c08	%f14
 | |
| #define c09	%f16
 | |
| #define c10	%f18
 | |
| #define c11	%f20
 | |
| #define c12	%f22
 | |
| #define c13	%f24
 | |
| #define c14	%f26
 | |
| #define c15	%f28
 | |
| #define c16	%f30
 | |
| 
 | |
| #define t1	%f32	/* t1..t4: latest FMUL products, added into the accumulators one step later */
 | |
| #define	t2 	%f34
 | |
| #define t3	%f36
 | |
| #define	t4 	%f38
 | |
| 
 | |
| #define a1	%f40	/* a1..a5: operands loaded through AO in the accumulation loops; reused for loads in the solve step */
 | |
| #define a2	%f42
 | |
| #define a3	%f44
 | |
| #define a4	%f46
 | |
| #define a5	%f58
 | |
| 
 | |
| #define b1	%f48	/* b1..b5: operands loaded mainly through BO in the accumulation loops */
 | |
| #define b2	%f50
 | |
| #define b3	%f52
 | |
| #define b4	%f54
 | |
| #define b5	%f56
 | |
| 
 | |
| #define FZERO	%f60	/* zero constant copied by FMOV; presumably the register cleared by FCLR(29) - confirm in common.h */
 | |
| #define ALPHA	%f62	/* not referenced in the visible part of this kernel */
 | |
| #else	/* single precision: consecutive %f registers, same roles as in the DOUBLE branch */
 | |
| #define c01	%f0	/* c01..c16: accumulators */
 | |
| #define c02	%f1
 | |
| #define c03	%f2
 | |
| #define c04	%f3
 | |
| #define c05	%f4
 | |
| #define c06	%f5
 | |
| #define c07	%f6
 | |
| #define c08	%f7
 | |
| #define c09	%f8
 | |
| #define c10	%f9
 | |
| #define c11	%f10
 | |
| #define c12	%f11
 | |
| #define c13	%f12
 | |
| #define c14	%f13
 | |
| #define c15	%f14
 | |
| #define c16	%f15
 | |
| 
 | |
| #define t1	%f16	/* t1..t4: pending FMUL products */
 | |
| #define	t2 	%f17
 | |
| #define t3	%f18
 | |
| #define	t4 	%f19
 | |
| 
 | |
| #define a1	%f20	/* a1..a5: operands loaded through AO */
 | |
| #define a2	%f21
 | |
| #define a3	%f22
 | |
| #define a4	%f23
 | |
| #define a5	%f31
 | |
| 
 | |
| #define b1	%f24	/* b1..b5: operands loaded mainly through BO */
 | |
| #define b2	%f25
 | |
| #define b3	%f26
 | |
| #define b4	%f27
 | |
| #define b5	%f28
 | |
| 
 | |
| #define FZERO	%f29	/* zero constant copied by FMOV; presumably the register cleared by FCLR(29) - confirm in common.h */
 | |
| #define ALPHA	%f30	/* not referenced in the visible part of this kernel */
 | |
| #endif
 | |
| 
 | |
| #define APREFETCHSIZE 40	/* prefetch distance ahead of AO, in elements (multiplied by SIZE at use) */
 | |
| #define BPREFETCHSIZE 40	/* prefetch distance ahead of BO, in elements (multiplied by SIZE at use) */
 | |
| 
 | |
| #define APREFETCH_CATEGORY 0	/* second operand of the prefetch instruction issued on AO */
 | |
| #define BPREFETCH_CATEGORY 0	/* second operand of the prefetch instruction issued on BO */
 | |
| 
 | |
| 	PROLOGUE
 | |
| 	SAVESP
 | |
| 	nop
 | |
| 
 | |
| #ifndef __64BIT__
 | |
| #ifdef DOUBLE
 | |
| 	ld	[%sp + STACK_START + 28], B
 | |
| 	ld	[%sp + STACK_START + 32], C
 | |
| 	ld	[%sp + STACK_START + 36], LDC
 | |
| 	ld	[%sp + STACK_START + 40], OFFSET
 | |
| #else
 | |
| 	ld	[%sp + STACK_START + 28], C
 | |
| 	ld	[%sp + STACK_START + 32], LDC
 | |
| 	ld	[%sp + STACK_START + 36], OFFSET
 | |
| #endif
 | |
| #else
 | |
| 	ldx	[%sp+  STACK_START + 56], C
 | |
| 	ldx	[%sp+  STACK_START + 64], LDC
 | |
| 	ldx	[%sp+  STACK_START + 72], OFFSET
 | |
| #endif
 | |
| 
 | |
| 	FCLR(29)
 | |
| 
 | |
| 	sll	LDC, BASE_SHIFT, LDC
 | |
| 
 | |
| #ifdef LN
 | |
| 	smul	M, K, TEMP1
 | |
| 	sll	TEMP1, BASE_SHIFT, TEMP1
 | |
| 	add	A, TEMP1, A
 | |
| 
 | |
| 	sll	M, BASE_SHIFT, TEMP1
 | |
| 	add	C, TEMP1, C
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	neg	OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	smul	N, K, TEMP1
 | |
| 	sll	TEMP1, BASE_SHIFT, TEMP1
 | |
| 	add	B, TEMP1, B
 | |
| 
 | |
| 	smul	N, LDC, TEMP1
 | |
| 	add	C, TEMP1, C
 | |
| 
 | |
| 	sub	N, OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| 	sra	N, 2, J
 | |
| 	cmp	J, 0
 | |
| 	ble,pn	%icc, .LL100
 | |
| 	nop
 | |
| 
 | |
| .LL11:
 | |
| #ifdef RT
 | |
| 	sll	K, 2 + BASE_SHIFT, TEMP1
 | |
| 	sub	B, TEMP1, B
 | |
| 
 | |
| 	sll	LDC, 2, TEMP1
 | |
| 	sub	C, TEMP1, C
 | |
| #endif
 | |
| 
 | |
| 	mov	C,  C1
 | |
| 	add	C,  LDC, C2
 | |
| 	add	C2, LDC, C3
 | |
| 	add	C3, LDC, C4
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	M, OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	mov	OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(RT)
 | |
| 	mov	A, AORIG
 | |
| #else
 | |
| 	mov	A, AO
 | |
| #endif
 | |
| 
 | |
| #ifndef RT
 | |
| 	add	C4, LDC, C
 | |
| #endif
 | |
| 
 | |
| 	and	M, 1, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL50
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| 
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| 
 | |
| #ifdef LN
 | |
| 	sll	K,  0 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 0 + BASE_SHIFT, TEMP1
 | |
| 	sll	KK, 2 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| 
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t1
 | |
|  	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c02
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t2
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c03
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, t3
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| 	ble,pn	%icc, .LL75
 | |
| 	nop
 | |
| 
 | |
| .LL72:
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[BO + 4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[BO + 5 * SIZE], b2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a1, b3, t3
 | |
| 	LDF	[BO + 6 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[BO + 7 * SIZE], b4
 | |
| 	LDF	[AO +  4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	AO,  4 * SIZE, AO
 | |
| 	FMUL	a2, b1, t1
 | |
| 	LDF	[BO +  8 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b2, t2
 | |
| 	LDF	[BO +  9 * SIZE], b2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a2, b3, t3
 | |
| 	LDF	[BO + 10 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[BO + 11 * SIZE], b4
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a3, b1, t1
 | |
| 	LDF	[BO + 12 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a3, b2, t2
 | |
| 	LDF	[BO + 13 * SIZE], b2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a3, b3, t3
 | |
| 	LDF	[BO + 14 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[BO + 15 * SIZE], b4
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO + 16 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO + 17 * SIZE], b2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO + 18 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO + 19 * SIZE], b4
 | |
| 
 | |
| 	add	BO, 16 * SIZE, BO
 | |
| 	bg,pt	%icc, .LL72
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| .LL75:
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL79
 | |
| 	nop
 | |
| 
 | |
| .LL76:
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	AO, 1 * SIZE, AO
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[BO + 4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[BO + 5 * SIZE], b2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a1, b3, t3
 | |
| 	LDF	[BO + 6 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	add	BO, 4 * SIZE, BO
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 
 | |
| 	bg,pt	%icc, .LL76
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 
 | |
| 
 | |
| .LL79:
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| 
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 1, TEMP1
 | |
| #else
 | |
| 	sub	KK, 4, TEMP1
 | |
| #endif
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c01, t1
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a4, c01, t1
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1
 | |
| 	LDF	[BO +  6 * SIZE], a2
 | |
| 	LDF	[BO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a3, c02, t1
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1
 | |
| 	LDF	[BO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[BO + 15 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO + 15 * SIZE], a1
 | |
| 	LDF	[BO + 14 * SIZE], a2
 | |
| 	LDF	[BO + 13 * SIZE], a3
 | |
| 	LDF	[BO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a2, c04, t1
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a3, c04, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a4, c04, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1
 | |
| 	LDF	[BO +  9 * SIZE], a2
 | |
| 	LDF	[BO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c03, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1
 | |
| 	LDF	[BO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	C1, -1 * SIZE, C1
 | |
| 	add	C2, -1 * SIZE, C2
 | |
| 	add	C3, -1 * SIZE, C3
 | |
| 	add	C4, -1 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c02, [BO +  1 * SIZE]
 | |
| 	STF	c03, [BO +  2 * SIZE]
 | |
| 	STF	c04, [BO +  3 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| 	STF	c03, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C2 + 0 * SIZE]
 | |
| 	STF	c03, [C3 + 0 * SIZE]
 | |
| 	STF	c04, [C4 + 0 * SIZE]
 | |
| 
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 1 * SIZE, C1
 | |
| 	add	C2, 1 * SIZE, C2
 | |
| 	add	C3, 1 * SIZE, C3
 | |
| 	add	C4, 1 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	add	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| .LL50:
 | |
| 	and	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL70
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| 
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| 
 | |
| #ifdef LN
 | |
| 	sll	K,  1 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP1
 | |
| 	sll	KK, 2 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| 
 | |
| 	FMOV	FZERO, c02
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, c04
 | |
| 
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, t2
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, c06
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, t3
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, c08
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, t4
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c03
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, c05
 | |
| 
 | |
| 	ble,pn	%icc, .LL55
 | |
| 	FMOV	FZERO, c07
 | |
| 
 | |
| .LL52:
 | |
| 	FADD	c02, t1, c02
 | |
| 	add	AO,  8 * SIZE, AO
 | |
| 	prefetch [AO + APREFETCHSIZE * SIZE], 0
 | |
| 
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	BO, 16 * SIZE, BO
 | |
| 
 | |
| 	FADD	c04, t2, c04
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b2, t2
 | |
| 
 | |
| 	FADD	c06, t3, c06
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a1, b3, t3
 | |
| 
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO -  4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a2, b1, t1
 | |
| 	LDF	[BO - 12 * SIZE], b1
 | |
| 	FADD	c03, t2, c03
 | |
| 	FMUL	a2, b2, t2
 | |
| 	LDF	[BO - 11 * SIZE], b2
 | |
| 
 | |
| 	FADD	c05, t3, c05
 | |
| 	FMUL	a2, b3, t3
 | |
| 	LDF	[BO - 10 * SIZE], b3
 | |
| 	FADD	c07, t4, c07
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[BO -  9 * SIZE], b4
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	FMUL	a3, b1, t1
 | |
| 	LDF	[AO -  3 * SIZE], a2
 | |
| 	FADD	c04, t2, c04
 | |
| 	FMUL	a3, b2, t2
 | |
| 
 | |
| 	FADD	c06, t3, c06
 | |
| 	FMUL	a3, b3, t3
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO -  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO -  8 * SIZE], b1
 | |
| 	FADD	c03, t2, c03
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO -  7 * SIZE], b2
 | |
| 
 | |
| 	FADD	c05, t3, c05
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO -  6 * SIZE], b3
 | |
| 	FADD	c07, t4, c07
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO -  5 * SIZE], b4
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO -  1 * SIZE], a4
 | |
| 	FADD	c04, t2, c04
 | |
| 	FMUL	a1, b2, t2
 | |
| 
 | |
| 	FADD	c06, t3, c06
 | |
| 	FMUL	a1, b3, t3
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a2, b1, t1
 | |
| 	LDF	[BO -  4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c03, t2, c03
 | |
| 	FMUL	a2, b2, t2
 | |
| 	LDF	[BO -  3 * SIZE], b2
 | |
| 
 | |
| 	FADD	c05, t3, c05
 | |
| 	FMUL	a2, b3, t3
 | |
| 	LDF	[BO -  2 * SIZE], b3
 | |
| 	FADD	c07, t4, c07
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[BO -  1 * SIZE], b4
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	FMUL	a3, b1, t1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	FADD	c04, t2, c04
 | |
| 	FMUL	a3, b2, t2
 | |
| 
 | |
| 	FADD	c06, t3, c06
 | |
| 	FMUL	a3, b3, t3
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO +  0 * SIZE], b1
 | |
| 	FADD	c03, t2, c03
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO +  1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c05, t3, c05
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO +  2 * SIZE], b3
 | |
| 	FADD	c07, t4, c07
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO +  3 * SIZE], b4
 | |
| 
 | |
| 	bg,pt	%icc, .LL52
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| .LL55:
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL59
 | |
| 	nop
 | |
| 
 | |
| .LL56:
 | |
| 	FADD	c02, t1, c02
 | |
| 	add	AO, 2 * SIZE, AO
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	L, -1, L
 | |
| 
 | |
| 	add	BO, 4 * SIZE, BO
 | |
| 	FADD	c04, t2, c04
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a1, b2, t2
 | |
| 
 | |
| 	FADD	c06, t3, c06
 | |
| 	FMUL	a1, b3, t3
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a2, b1, t1
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FADD	c03, t2, c03
 | |
| 	FMUL	a2, b2, t2
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c05, t3, c05
 | |
| 	FMUL	a2, b3, t3
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FADD	c07, t4, c07
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 
 | |
| 	bg,pt	%icc, .LL56
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 
 | |
| .LL59:
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, TEMP1
 | |
| #else
 | |
| 	sub	KK, 4, TEMP1
 | |
| #endif
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	FADD	c04, t2, c04
 | |
| 	FADD	c06, t3, c06
 | |
| 	FADD	c08, t4, c08
 | |
| 
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[BO +  4 * SIZE], b1
 | |
| 	LDF	[BO +  5 * SIZE], b2
 | |
| 	LDF	[BO +  6 * SIZE], b3
 | |
| 	LDF	[BO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c03, c03
 | |
| 	FSUB	a3, c05, c05
 | |
| 	FSUB	a4, c07, c07
 | |
| 
 | |
| 	FSUB	b1, c02, c02
 | |
| 	FSUB	b2, c04, c04
 | |
| 	FSUB	b3, c06, c06
 | |
| 	FSUB	b4, c08, c08
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[AO +  4 * SIZE], b1
 | |
| 	LDF	[AO +  5 * SIZE], b2
 | |
| 	LDF	[AO +  6 * SIZE], b3
 | |
| 	LDF	[AO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| 
 | |
| 	FSUB	b1, c05, c05
 | |
| 	FSUB	b2, c06, c06
 | |
| 	FSUB	b3, c07, c07
 | |
| 	FSUB	b4, c08, c08
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  3 * SIZE], a1
 | |
| 	LDF	[AO +  2 * SIZE], a2
 | |
| 	LDF	[AO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c08, c08
 | |
| 
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c04, t2
 | |
| 	FMUL	a2, c06, t3
 | |
| 	FMUL	a2, c08, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c03, t2, c03
 | |
| 	FSUB	c05, t3, c05
 | |
| 	FSUB	c07, t4, c07
 | |
| 
 | |
| 	FMUL	a3, c01, c01
 | |
| 	FMUL	a3, c03, c03
 | |
| 	FMUL	a3, c05, c05
 | |
| 	FMUL	a3, c07, c07
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c07, c07
 | |
| 
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c03, t2
 | |
| 	FMUL	a2, c05, t3
 | |
| 	FMUL	a2, c07, t4
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c04, t2, c04
 | |
| 	FSUB	c06, t3, c06
 | |
| 	FSUB	c08, t4, c08
 | |
| 
 | |
| 	FMUL	a3, c02, c02
 | |
| 	FMUL	a3, c04, c04
 | |
| 	FMUL	a3, c06, c06
 | |
| 	FMUL	a3, c08, c08
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c02, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c04, t2, c04
 | |
| 
 | |
| 	FMUL	a3, c01, t1
 | |
| 	FMUL	a3, c02, t2
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 
 | |
| 	FMUL	a4, c01, t1
 | |
| 	FMUL	a4, c02, t2
 | |
| 
 | |
| 	FSUB	c07, t1, c07
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1
 | |
| 	LDF	[BO +  6 * SIZE], a2
 | |
| 	LDF	[BO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| 
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c04, t2
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 
 | |
| 	FMUL	a3, c03, t1
 | |
| 	FMUL	a3, c04, t2
 | |
| 
 | |
| 	FSUB	c07, t1, c07
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1
 | |
| 	LDF	[BO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c06, c06
 | |
| 
 | |
| 	FMUL	a2, c05, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 
 | |
| 	FSUB	c07, t1, c07
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[BO + 15 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c08, c08
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO + 15 * SIZE], a1
 | |
| 	LDF	[BO + 14 * SIZE], a2
 | |
| 	LDF	[BO + 13 * SIZE], a3
 | |
| 	LDF	[BO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c08, c08
 | |
| 
 | |
| 	FMUL	a2, c07, t1
 | |
| 	FMUL	a2, c08, t2
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 
 | |
| 	FMUL	a3, c07, t1
 | |
| 	FMUL	a3, c08, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c04, t2, c04
 | |
| 
 | |
| 	FMUL	a4, c07, t1
 | |
| 	FMUL	a4, c08, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1
 | |
| 	LDF	[BO +  9 * SIZE], a2
 | |
| 	LDF	[BO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c06, c06
 | |
| 
 | |
| 	FMUL	a2, c05, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c04, t2, c04
 | |
| 
 | |
| 	FMUL	a3, c05, t1
 | |
| 	FMUL	a3, c06, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1
 | |
| 	LDF	[BO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| 
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c04, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	C1, -2 * SIZE, C1
 | |
| 	add	C2, -2 * SIZE, C2
 | |
| 	add	C3, -2 * SIZE, C3
 | |
| 	add	C4, -2 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c03, [BO +  1 * SIZE]
 | |
| 	STF	c05, [BO +  2 * SIZE]
 | |
| 	STF	c07, [BO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c02, [BO +  4 * SIZE]
 | |
| 	STF	c04, [BO +  5 * SIZE]
 | |
| 	STF	c06, [BO +  6 * SIZE]
 | |
| 	STF	c08, [BO +  7 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| 	STF	c03, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c05, [AO +  4 * SIZE]
 | |
| 	STF	c06, [AO +  5 * SIZE]
 | |
| 	STF	c07, [AO +  6 * SIZE]
 | |
| 	STF	c08, [AO +  7 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C1 + 1 * SIZE]
 | |
| 	STF	c03, [C2 + 0 * SIZE]
 | |
| 	STF	c04, [C2 + 1 * SIZE]
 | |
| 
 | |
| 	STF	c05, [C3 + 0 * SIZE]
 | |
| 	STF	c06, [C3 + 1 * SIZE]
 | |
| 	STF	c07, [C4 + 0 * SIZE]
 | |
| 	STF	c08, [C4 + 1 * SIZE]
 | |
| 
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 2 * SIZE, C1
 | |
| 	add	C2, 2 * SIZE, C2
 | |
| 	add	C3, 2 * SIZE, C3
 | |
| 	add	C4, 2 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	add	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| .LL70:
 | |
| 	sra	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL99
 | |
| 	nop
 | |
| 
 | |
| .LL21:
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| 	FMOV	FZERO, c01
 | |
| 	FMOV	FZERO, c02
 | |
| 	FMOV	FZERO, c03
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| 
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| 
 | |
| #ifdef LN
 | |
| 	sll	K,  2 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 2 + BASE_SHIFT, TEMP1
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1
 | |
| 
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| 
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, c05
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c06
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, c07
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c08
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, c09
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c10
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, c11
 | |
| 	LDF	[BO +  4 * SIZE], b5	/* ***** */
 | |
| 
 | |
| 	LDF	[AO +  4 * SIZE], a5	/* ***** */
 | |
| 
 | |
| #ifdef LN
 | |
| 	prefetch [C1 + 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c12
 | |
| 	prefetch [C2 + 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c13
 | |
| 	prefetch [C3 + 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c14
 | |
| 	prefetch [C4 + 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c15
 | |
| #else
 | |
| 	prefetch [C1 - 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c12
 | |
| 	prefetch [C2 - 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c13
 | |
| 	prefetch [C3 - 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c14
 | |
| 	prefetch [C4 - 3 * SIZE], 3
 | |
| 	FMOV	FZERO, c15
 | |
| #endif
 | |
| 
 | |
| 	ble,pn	%icc, .LL25
 | |
| 	FMOV	FZERO, c16
 | |
| 
 | |
| .LL22:
 | |
| 	FADD	c04, t1, c04
 | |
| 	prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY
 | |
| 	FMUL	a1, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c08, t2, c08
 | |
| 	prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY
 | |
| 	FMUL	a1, b2, t2
 | |
| 	add	AO, 16 * SIZE, AO
 | |
| 
 | |
| 	FADD	c12, t3, c12
 | |
| 	LDF	[AO - 13 * SIZE], a4
 | |
| 	FMUL	a1, b3, t3
 | |
| 	add	BO, 16 * SIZE, BO
 | |
| 
 | |
| 	FADD	c16, t4, c16
 | |
| 	nop
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO -  8 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a2, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c09, t3, c09
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c13, t4, c13
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO - 11 * SIZE], a2
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	nop
 | |
| 	FMUL	a3, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c06, t2, c06
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c10, t3, c10
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c14, t4, c14
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO - 10 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO -  8 * SIZE], b1
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO - 11 * SIZE], b2
 | |
| 
 | |
| 	FADD	c11, t3, c11
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO - 10 * SIZE], b3
 | |
| 
 | |
| 	FADD	c15, t4, c15
 | |
| 	nop
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO -  9 * SIZE], b4
 | |
| 
 | |
| 	FADD	c04, t1, c04
 | |
| 	nop
 | |
| 	FMUL	a5, b5, t1
 | |
| 	LDF	[AO -  9 * SIZE], a4
 | |
| 
 | |
| 	FADD	c08, t2, c08
 | |
| 	nop
 | |
| 	FMUL	a5, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c12, t3, c12
 | |
| 	nop
 | |
| 	FMUL	a5, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c16, t4, c16
 | |
| 	nop
 | |
| 	FMUL	a5, b4, t4
 | |
| 	LDF	[AO - 4 * SIZE], a5
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a2, b5, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c09, t3, c09
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c13, t4, c13
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO -  7 * SIZE], a2
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	nop
 | |
| 	FMUL	a3, b5, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c06, t2, c06
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c10, t3, c10
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c14, t4, c14
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO -  6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a4, b5, t1
 | |
| 	LDF	[BO - 4 * SIZE], b5
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO -  7 * SIZE], b2
 | |
| 
 | |
| 	FADD	c11, t3, c11
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO -  6 * SIZE], b3
 | |
| 
 | |
| 	FADD	c15, t4, c15
 | |
| 	nop
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO -  5 * SIZE], b4
 | |
| 
 | |
| 	FADD	c04, t1, c04
 | |
| 	nop
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO -  5 * SIZE], a4
 | |
| 
 | |
| 	FADD	c08, t2, c08
 | |
| 	nop
 | |
| 	FMUL	a1, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c12, t3, c12
 | |
| 	nop
 | |
| 	FMUL	a1, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c16, t4, c16
 | |
| 	nop
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO -  0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a2, b1, t1
 | |
| 	nop
 | |
| 
 | |
| #ifdef DOUBLE
 | |
| 	prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY
 | |
| #else
 | |
| 	nop
 | |
| #endif
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t2
 | |
| 
 | |
| 	FADD	c09, t3, c09
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c13, t4, c13
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	nop
 | |
| 	FMUL	a3, b1, t1
 | |
| 	LDF	[AO - 3 * SIZE], a2
 | |
| 
 | |
| 	FADD	c06, t2, c06
 | |
| #ifdef DOUBLE
 | |
| 	prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY
 | |
| #else
 | |
| 	nop
 | |
| #endif
 | |
| 	FMUL	a3, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c10, t3, c10
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c14, t4, c14
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO - 2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO -  0 * SIZE], b1
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO - 3 * SIZE], b2
 | |
| 
 | |
| 	FADD	c11, t3, c11
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO - 2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c15, t4, c15
 | |
| 	nop
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO - 1 * SIZE], b4
 | |
| 
 | |
| 	FADD	c04, t1, c04
 | |
| 	nop
 | |
| 	FMUL	a5, b5, t1
 | |
| 	LDF	[AO - 1 * SIZE], a4
 | |
| 
 | |
| 	FADD	c08, t2, c08
 | |
| 	FMUL	a5, b2, t2
 | |
| 	FADD	c12, t3, c12
 | |
| 	FMUL	a5, b3, t3
 | |
| 
 | |
| 	FADD	c16, t4, c16
 | |
| 	nop
 | |
| 	FMUL	a5, b4, t4
 | |
| 	LDF	[AO +  4 * SIZE], a5
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a2, b5, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c09, t3, c09
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c13, t4, c13
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c02, t1, c02
 | |
| 	nop
 | |
| 	FMUL	a3, b5, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c06, t2, c06
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c10, t3, c10
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c14, t4, c14
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t1, c03
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a4, b5, t1
 | |
| 	LDF	[BO +  4 * SIZE], b5
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO +  1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c11, t3, c11
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO +  2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c15, t4, c15
 | |
| 	FMUL	a4, b4, t4
 | |
| 	bg,pt	%icc, .LL22
 | |
| 	LDF	[BO +  3 * SIZE], b4
 | |
| 
 | |
| .LL25:	/* 4x4 tile: work out how many single k-steps remain after the unrolled loop */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L	/* L = KK mod 4 */
 | |
| #else
 | |
| 	and	TEMP1, 3, L	/* L = TEMP1 mod 4; TEMP1 presumably still holds K - KK from the tile setup above this chunk — TODO confirm */
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL29	/* no leftover k-steps: go straight to the solve at .LL29 */
 | |
| 	nop
 | |
| 
 | |
| .LL26:	/* 4x4 tile, one k-step per pass: each FADD retires a product from the previous group while the paired FMUL starts the next one */
 | |
| 	FADD	c04, t1, c04
 | |
| 	LDF	[AO +  3 * SIZE], a4	/* a4 for this k-step, read before AO is advanced */
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	AO, 4 * SIZE, AO	/* from here on AO offsets refer to the next k-step */
 | |
| 
 | |
| 	FADD	c08, t2, c08
 | |
| 	add	BO, 4 * SIZE, BO	/* from here on BO offsets refer to the next k-step */
 | |
| 	FMUL	a1, b2, t2
 | |
| 	add	L, -1, L
 | |
| 
 | |
| 	FADD	c12, t3, c12
 | |
| 	nop
 | |
| 	FMUL	a1, b3, t3
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom; only FP ops, loads and nops follow */
 | |
| 
 | |
| 	FADD	c16, t4, c16
 | |
| 	nop
 | |
| 	FMUL	a1, b4, t4
 | |
| 	LDF	[AO + 0 * SIZE], a1	/* a1 is no longer needed this step: reload it for the next one */
 | |
| 
 | |
| 	FADD	c01, t1, c01	/* a1 products land in c01, c05, c09, c13 */
 | |
| 	nop
 | |
| 	FMUL	a2, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c09, t3, c09
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c13, t4, c13
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c02, t1, c02	/* a2 products land in c02, c06, c10, c14 */
 | |
| 	nop
 | |
| 	FMUL	a3, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c06, t2, c06
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c10, t3, c10
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c14, t4, c14
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t4
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t1, c03	/* a3 products land in c03, c07, c11, c15 */
 | |
| 	nop
 | |
| 	FMUL	a4, b1, t1
 | |
| 	LDF	[BO + 0 * SIZE], b1	/* each b register is reloaded right after its last use */
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t2
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c11, t3, c11
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c15, t4, c15
 | |
| 	FMUL	a4, b4, t4	/* the a4 products are still pending in t1..t4 when the loop exits */
 | |
| 	bg,pt	%icc, .LL26
 | |
| 	LDF	[BO + 3 * SIZE], b4	/* branch delay slot */
 | |
| 
 | |
| .LL29:	/* 4x4 tile: accumulation finished; position AO/BO for the solve and retire the last products */
 | |
| #if defined(LN) || defined(RT)
 | |
| 	sub	KK, 4, TEMP1
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1	/* (KK - 4) * 4 elements, assuming BASE_SHIFT is log2 of the element size — TODO confirm */
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| 	FADD	c04, t1, c04	/* t1..t4 still hold the a4 products of the final k-step */
 | |
| 	FADD	c08, t2, c08
 | |
| 	FADD	c12, t3, c12
 | |
| 	FADD	c16, t4, c16
 | |
| 
 | |
| #if defined(LN) || defined(LT)	/* LN/LT: the 16 right-hand-side values are read from BO, four per group in the order c01,c05,c09,c13 / c02,... */
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[BO +  4 * SIZE], b1
 | |
| 	LDF	[BO +  5 * SIZE], b2
 | |
| 	LDF	[BO +  6 * SIZE], b3
 | |
| 	LDF	[BO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01	/* accumulator = loaded value - accumulator, assuming FSUB keeps the SPARC rs1, rs2, rd operand order — TODO confirm */
 | |
| 	FSUB	a2, c05, c05
 | |
| 	FSUB	a3, c09, c09
 | |
| 	FSUB	a4, c13, c13
 | |
| 
 | |
| 	FSUB	b1, c02, c02
 | |
| 	FSUB	b2, c06, c06
 | |
| 	FSUB	b3, c10, c10
 | |
| 	FSUB	b4, c14, c14
 | |
| 
 | |
| 	LDF	[BO +  8 * SIZE], a1	/* a1..a4 and b1..b4 are reused as scratch for the second half */
 | |
| 	LDF	[BO +  9 * SIZE], a2
 | |
| 	LDF	[BO + 10 * SIZE], a3
 | |
| 	LDF	[BO + 11 * SIZE], a4
 | |
| 
 | |
| 	LDF	[BO + 12 * SIZE], b1
 | |
| 	LDF	[BO + 13 * SIZE], b2
 | |
| 	LDF	[BO + 14 * SIZE], b3
 | |
| 	LDF	[BO + 15 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c03, c03
 | |
| 	FSUB	a2, c07, c07
 | |
| 	FSUB	a3, c11, c11
 | |
| 	FSUB	a4, c15, c15
 | |
| 
 | |
| 	FSUB	b1, c04, c04
 | |
| 	FSUB	b2, c08, c08
 | |
| 	FSUB	b3, c12, c12
 | |
| 	FSUB	b4, c16, c16
 | |
| #else	/* RN/RT: the 16 values are read from AO instead, in plain c01..c16 order */
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[AO +  4 * SIZE], b1
 | |
| 	LDF	[AO +  5 * SIZE], b2
 | |
| 	LDF	[AO +  6 * SIZE], b3
 | |
| 	LDF	[AO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| 
 | |
| 	FSUB	b1, c05, c05
 | |
| 	FSUB	b2, c06, c06
 | |
| 	FSUB	b3, c07, c07
 | |
| 	FSUB	b4, c08, c08
 | |
| 
 | |
| 	LDF	[AO +  8 * SIZE], a1
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 	LDF	[AO + 10 * SIZE], a3
 | |
| 	LDF	[AO + 11 * SIZE], a4
 | |
| 
 | |
| 	LDF	[AO + 12 * SIZE], b1
 | |
| 	LDF	[AO + 13 * SIZE], b2
 | |
| 	LDF	[AO + 14 * SIZE], b3
 | |
| 	LDF	[AO + 15 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c09, c09
 | |
| 	FSUB	a2, c10, c10
 | |
| 	FSUB	a3, c11, c11
 | |
| 	FSUB	a4, c12, c12
 | |
| 
 | |
| 	FSUB	b1, c13, c13
 | |
| 	FSUB	b2, c14, c14
 | |
| 	FSUB	b3, c15, c15
 | |
| 	FSUB	b4, c16, c16
 | |
| #endif
 | |
| 
 | |
| #ifdef LN	/* LN: substitute through the 4x4 block at AO from row 4 (c04,c08,c12,c16) up to row 1 (c01,c05,c09,c13) */
 | |
| 	LDF	[AO + 15 * SIZE], a1
 | |
| 	LDF	[AO + 14 * SIZE], a2
 | |
| 	LDF	[AO + 13 * SIZE], a3
 | |
| 	LDF	[AO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c04, c04	/* row 4 is scaled by AO[15]; multiplied, not divided, so the diagonal is presumably stored inverted — TODO confirm */
 | |
| 	FMUL	a1, c08, c08
 | |
| 	FMUL	a1, c12, c12
 | |
| 	FMUL	a1, c16, c16
 | |
| 
 | |
| 	FMUL	a2, c04, t1	/* AO[14] * row 4, removed from row 3 */
 | |
| 	FMUL	a2, c08, t2
 | |
| 	FMUL	a2, c12, t3
 | |
| 	FMUL	a2, c16, t4
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c15, t4, c15
 | |
| 
 | |
| 	FMUL	a3, c04, t1	/* AO[13] * row 4, removed from row 2 */
 | |
| 	FMUL	a3, c08, t2
 | |
| 	FMUL	a3, c12, t3
 | |
| 	FMUL	a3, c16, t4
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c10, t3, c10
 | |
| 	FSUB	c14, t4, c14
 | |
| 
 | |
| 	FMUL	a4, c04, t1	/* AO[12] * row 4, removed from row 1 */
 | |
| 	FMUL	a4, c08, t2
 | |
| 	FMUL	a4, c12, t3
 | |
| 	FMUL	a4, c16, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 	FSUB	c09, t3, c09
 | |
| 	FSUB	c13, t4, c13
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1	/* row 3: scale by AO[10], then update rows 2 and 1 with AO[9], AO[8] */
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 	LDF	[AO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c11, c11
 | |
| 	FMUL	a1, c15, c15
 | |
| 
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c07, t2
 | |
| 	FMUL	a2, c11, t3
 | |
| 	FMUL	a2, c15, t4
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c10, t3, c10
 | |
| 	FSUB	c14, t4, c14
 | |
| 
 | |
| 	FMUL	a3, c03, t1
 | |
| 	FMUL	a3, c07, t2
 | |
| 	FMUL	a3, c11, t3
 | |
| 	FMUL	a3, c15, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 	FSUB	c09, t3, c09
 | |
| 	FSUB	c13, t4, c13
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1	/* row 2: scale by AO[5], then update row 1 with AO[4] */
 | |
| 	LDF	[AO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c10, c10
 | |
| 	FMUL	a1, c14, c14
 | |
| 
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 	FMUL	a2, c10, t3
 | |
| 	FMUL	a2, c14, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 	FSUB	c09, t3, c09
 | |
| 	FSUB	c13, t4, c13
 | |
| 
 | |
| 	LDF	[AO +  0 * SIZE], a1	/* row 1: only the AO[0] scaling remains */
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c09, c09
 | |
| 	FMUL	a1, c13, c13
 | |
| #endif
 | |
| 
 | |
| #ifdef LT	/* LT: substitute through the 4x4 block at AO from row 1 (c01,c05,c09,c13) down to row 4 (c04,c08,c12,c16) */
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01	/* row 1 is scaled by AO[0]; multiplied, not divided, so the diagonal is presumably stored inverted — TODO confirm */
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c09, c09
 | |
| 	FMUL	a1, c13, c13
 | |
| 
 | |
| 	FMUL	a2, c01, t1	/* AO[1] * row 1, removed from row 2 */
 | |
| 	FMUL	a2, c05, t2
 | |
| 	FMUL	a2, c09, t3
 | |
| 	FMUL	a2, c13, t4
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c10, t3, c10
 | |
| 	FSUB	c14, t4, c14
 | |
| 
 | |
| 	FMUL	a3, c01, t1	/* AO[2] * row 1, removed from row 3 */
 | |
| 	FMUL	a3, c05, t2
 | |
| 	FMUL	a3, c09, t3
 | |
| 	FMUL	a3, c13, t4
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c15, t4, c15
 | |
| 
 | |
| 	FMUL	a4, c01, t1	/* AO[3] * row 1, removed from row 4 */
 | |
| 	FMUL	a4, c05, t2
 | |
| 	FMUL	a4, c09, t3
 | |
| 	FMUL	a4, c13, t4
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 	FSUB	c12, t3, c12
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1	/* row 2: scale by AO[5], then update rows 3 and 4 with AO[6], AO[7] */
 | |
| 	LDF	[AO +  6 * SIZE], a2
 | |
| 	LDF	[AO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c10, c10
 | |
| 	FMUL	a1, c14, c14
 | |
| 
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 	FMUL	a2, c10, t3
 | |
| 	FMUL	a2, c14, t4
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c15, t4, c15
 | |
| 
 | |
| 	FMUL	a3, c02, t1
 | |
| 	FMUL	a3, c06, t2
 | |
| 	FMUL	a3, c10, t3
 | |
| 	FMUL	a3, c14, t4
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 	FSUB	c12, t3, c12
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1	/* row 3: scale by AO[10], then update row 4 with AO[11] */
 | |
| 	LDF	[AO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c11, c11
 | |
| 	FMUL	a1, c15, c15
 | |
| 
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c07, t2
 | |
| 	FMUL	a2, c11, t3
 | |
| 	FMUL	a2, c15, t4
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 	FSUB	c12, t3, c12
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[AO + 15 * SIZE], a1	/* row 4: only the AO[15] scaling remains */
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a1, c08, c08
 | |
| 	FMUL	a1, c12, c12
 | |
| 	FMUL	a1, c16, c16
 | |
| #endif
 | |
| 
 | |
| #ifdef RN	/* RN: substitute through the 4x4 block at BO from column 1 (c01..c04) forward to column 4 (c13..c16) */
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01	/* column 1 is scaled by BO[0]; multiplied, not divided, so the diagonal is presumably stored inverted — TODO confirm */
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| 
 | |
| 	FMUL	a2, c01, t1	/* BO[1] * column 1, removed from column 2 */
 | |
| 	FMUL	a2, c02, t2
 | |
| 	FMUL	a2, c03, t3
 | |
| 	FMUL	a2, c04, t4
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c07, t3, c07
 | |
| 	FSUB	c08, t4, c08
 | |
| 
 | |
| 	FMUL	a3, c01, t1	/* BO[2] * column 1, removed from column 3 */
 | |
| 	FMUL	a3, c02, t2
 | |
| 	FMUL	a3, c03, t3
 | |
| 	FMUL	a3, c04, t4
 | |
| 
 | |
| 	FSUB	c09, t1, c09
 | |
| 	FSUB	c10, t2, c10
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c12, t4, c12
 | |
| 
 | |
| 	FMUL	a4, c01, t1	/* BO[3] * column 1, removed from column 4 */
 | |
| 	FMUL	a4, c02, t2
 | |
| 	FMUL	a4, c03, t3
 | |
| 	FMUL	a4, c04, t4
 | |
| 
 | |
| 	FSUB	c13, t1, c13
 | |
| 	FSUB	c14, t2, c14
 | |
| 	FSUB	c15, t3, c15
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1	/* column 2: scale by BO[5], then update columns 3 and 4 with BO[6], BO[7] */
 | |
| 	LDF	[BO +  6 * SIZE], a2
 | |
| 	LDF	[BO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c08, c08
 | |
| 
 | |
| 	FMUL	a2, c05, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 	FMUL	a2, c07, t3
 | |
| 	FMUL	a2, c08, t4
 | |
| 
 | |
| 	FSUB	c09, t1, c09
 | |
| 	FSUB	c10, t2, c10
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c12, t4, c12
 | |
| 
 | |
| 	FMUL	a3, c05, t1
 | |
| 	FMUL	a3, c06, t2
 | |
| 	FMUL	a3, c07, t3
 | |
| 	FMUL	a3, c08, t4
 | |
| 
 | |
| 	FSUB	c13, t1, c13
 | |
| 	FSUB	c14, t2, c14
 | |
| 	FSUB	c15, t3, c15
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1	/* column 3: scale by BO[10], then update column 4 with BO[11] */
 | |
| 	LDF	[BO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c09, c09
 | |
| 	FMUL	a1, c10, c10
 | |
| 	FMUL	a1, c11, c11
 | |
| 	FMUL	a1, c12, c12
 | |
| 
 | |
| 	FMUL	a2, c09, t1
 | |
| 	FMUL	a2, c10, t2
 | |
| 	FMUL	a2, c11, t3
 | |
| 	FMUL	a2, c12, t4
 | |
| 
 | |
| 	FSUB	c13, t1, c13
 | |
| 	FSUB	c14, t2, c14
 | |
| 	FSUB	c15, t3, c15
 | |
| 	FSUB	c16, t4, c16
 | |
| 
 | |
| 	LDF	[BO + 15 * SIZE], a1	/* column 4: only the BO[15] scaling remains */
 | |
| 
 | |
| 	FMUL	a1, c13, c13
 | |
| 	FMUL	a1, c14, c14
 | |
| 	FMUL	a1, c15, c15
 | |
| 	FMUL	a1, c16, c16
 | |
| #endif
 | |
| 
 | |
| #ifdef RT	/* RT: substitute through the 4x4 block at BO from column 4 (c13..c16) back to column 1 (c01..c04) */
 | |
| 	LDF	[BO + 15 * SIZE], a1
 | |
| 	LDF	[BO + 14 * SIZE], a2
 | |
| 	LDF	[BO + 13 * SIZE], a3
 | |
| 	LDF	[BO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c13, c13	/* column 4 is scaled by BO[15]; multiplied, not divided, so the diagonal is presumably stored inverted — TODO confirm */
 | |
| 	FMUL	a1, c14, c14
 | |
| 	FMUL	a1, c15, c15
 | |
| 	FMUL	a1, c16, c16
 | |
| 
 | |
| 	FMUL	a2, c13, t1	/* BO[14] * column 4, removed from column 3 */
 | |
| 	FMUL	a2, c14, t2
 | |
| 	FMUL	a2, c15, t3
 | |
| 	FMUL	a2, c16, t4
 | |
| 
 | |
| 	FSUB	c09, t1, c09
 | |
| 	FSUB	c10, t2, c10
 | |
| 	FSUB	c11, t3, c11
 | |
| 	FSUB	c12, t4, c12
 | |
| 
 | |
| 	FMUL	a3, c13, t1	/* BO[13] * column 4, removed from column 2 */
 | |
| 	FMUL	a3, c14, t2
 | |
| 	FMUL	a3, c15, t3
 | |
| 	FMUL	a3, c16, t4
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c07, t3, c07
 | |
| 	FSUB	c08, t4, c08
 | |
| 
 | |
| 	FMUL	a4, c13, t1	/* BO[12] * column 4, removed from column 1 */
 | |
| 	FMUL	a4, c14, t2
 | |
| 	FMUL	a4, c15, t3
 | |
| 	FMUL	a4, c16, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 	FSUB	c03, t3, c03
 | |
| 	FSUB	c04, t4, c04
 | |
| 
 | |
| 	LDF	[BO + 10 * SIZE], a1	/* column 3: scale by BO[10], then update columns 2 and 1 with BO[9], BO[8] */
 | |
| 	LDF	[BO +  9 * SIZE], a2
 | |
| 	LDF	[BO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c09, c09
 | |
| 	FMUL	a1, c10, c10
 | |
| 	FMUL	a1, c11, c11
 | |
| 	FMUL	a1, c12, c12
 | |
| 
 | |
| 	FMUL	a2, c09, t1
 | |
| 	FMUL	a2, c10, t2
 | |
| 	FMUL	a2, c11, t3
 | |
| 	FMUL	a2, c12, t4
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c07, t3, c07
 | |
| 	FSUB	c08, t4, c08
 | |
| 
 | |
| 	FMUL	a3, c09, t1
 | |
| 	FMUL	a3, c10, t2
 | |
| 	FMUL	a3, c11, t3
 | |
| 	FMUL	a3, c12, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 	FSUB	c03, t3, c03
 | |
| 	FSUB	c04, t4, c04
 | |
| 
 | |
| 	LDF	[BO +  5 * SIZE], a1	/* column 2: scale by BO[5], then update column 1 with BO[4] */
 | |
| 	LDF	[BO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c08, c08
 | |
| 
 | |
| 	FMUL	a2, c05, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 	FMUL	a2, c07, t3
 | |
| 	FMUL	a2, c08, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 	FSUB	c03, t3, c03
 | |
| 	FSUB	c04, t4, c04
 | |
| 
 | |
| 	LDF	[BO +  0 * SIZE], a1	/* column 1: only the BO[0] scaling remains */
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef LN	/* LN: step C1..C4 back by four elements before storing (the forward step below is skipped for LN) */
 | |
| 	add	C1, -4 * SIZE, C1
 | |
| 	add	C2, -4 * SIZE, C2
 | |
| 	add	C3, -4 * SIZE, C3
 | |
| 	add	C4, -4 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)	/* write the solved tile back to the buffer it was read from, in the same element order */
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c05, [BO +  1 * SIZE]
 | |
| 	STF	c09, [BO +  2 * SIZE]
 | |
| 	STF	c13, [BO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c02, [BO +  4 * SIZE]
 | |
| 	STF	c06, [BO +  5 * SIZE]
 | |
| 	STF	c10, [BO +  6 * SIZE]
 | |
| 	STF	c14, [BO +  7 * SIZE]
 | |
| 
 | |
| 	STF	c03, [BO +  8 * SIZE]
 | |
| 	STF	c07, [BO +  9 * SIZE]
 | |
| 	STF	c11, [BO + 10 * SIZE]
 | |
| 	STF	c15, [BO + 11 * SIZE]
 | |
| 
 | |
| 	STF	c04, [BO + 12 * SIZE]
 | |
| 	STF	c08, [BO + 13 * SIZE]
 | |
| 	STF	c12, [BO + 14 * SIZE]
 | |
| 	STF	c16, [BO + 15 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| 	STF	c03, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c05, [AO +  4 * SIZE]
 | |
| 	STF	c06, [AO +  5 * SIZE]
 | |
| 	STF	c07, [AO +  6 * SIZE]
 | |
| 	STF	c08, [AO +  7 * SIZE]
 | |
| 
 | |
| 	STF	c09, [AO +  8 * SIZE]
 | |
| 	STF	c10, [AO +  9 * SIZE]
 | |
| 	STF	c11, [AO + 10 * SIZE]
 | |
| 	STF	c12, [AO + 11 * SIZE]
 | |
| 
 | |
| 	STF	c13, [AO + 12 * SIZE]
 | |
| 	STF	c14, [AO + 13 * SIZE]
 | |
| 	STF	c15, [AO + 14 * SIZE]
 | |
| 	STF	c16, [AO + 15 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]	/* the same results go to C: c01..c04 via C1, c05..c08 via C2, c09..c12 via C3, c13..c16 via C4 */
 | |
| 	STF	c02, [C1 + 1 * SIZE]
 | |
| 	STF	c03, [C1 + 2 * SIZE]
 | |
| 	STF	c04, [C1 + 3 * SIZE]
 | |
| 
 | |
| 	STF	c05, [C2 + 0 * SIZE]
 | |
| 	STF	c06, [C2 + 1 * SIZE]
 | |
| 	STF	c07, [C2 + 2 * SIZE]
 | |
| 	STF	c08, [C2 + 3 * SIZE]
 | |
| 
 | |
| 	STF	c09, [C3 + 0 * SIZE]
 | |
| 	STF	c10, [C3 + 1 * SIZE]
 | |
| 	STF	c11, [C3 + 2 * SIZE]
 | |
| 	STF	c12, [C3 + 3 * SIZE]
 | |
| 
 | |
| 	STF	c13, [C4 + 0 * SIZE]
 | |
| 	STF	c14, [C4 + 1 * SIZE]
 | |
| 	STF	c15, [C4 + 2 * SIZE]
 | |
| 	STF	c16, [C4 + 3 * SIZE]
 | |
| 
 | |
| 	FMOV	FZERO, t1	/* clear the product temporaries (used as scratch by the solve) for the next tile */
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 4 * SIZE, C1
 | |
| 	add	C2, 4 * SIZE, C2
 | |
| 	add	C3, 4 * SIZE, C3
 | |
| 	add	C4, 4 * SIZE, C4
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 2 + BASE_SHIFT, TEMP1	/* RT: AORIG moves forward by K * 4 elements */
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1	/* LT/RN: skip the (K - KK) * 4 elements of A and B that this tile did not consume */
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP1, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	add	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| 	add	I, -1, I
 | |
| 	cmp	I, 0
 | |
| 
 | |
| 	sra	K, 2, L	/* sra does not touch %icc, so the branch below still tests I */
 | |
| 	bg,pt	%icc, .LL21	/* more 4x4 tiles left in this panel */
 | |
| 	FMOV	FZERO, c01	/* branch delay slot: executed whether or not the branch is taken */
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| .LL99:	/* end of one 4-column panel: update B and KK, then loop on J */
 | |
| #ifdef LN
 | |
| 	sll	K, 2 + BASE_SHIFT, TEMP1	/* LN: B advances by K * 4 elements */
 | |
| 	add	B, TEMP1, B
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	mov	BO, B	/* LT/RN: B continues from wherever BO ended up */
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	add	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sub	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| 	add	J, -1, J
 | |
| 	cmp	J, 0
 | |
| 	bg,pt	%icc, .LL11	/* another 4-column panel remains */
 | |
| 	nop
 | |
| 
 | |
| .LL100:  /* n & 2: process a remaining panel of two columns, or skip to .LL200 */
 | |
| 	and	N, 2, J
 | |
| 	cmp	J, 0
 | |
| 	ble,pn	%icc, .LL200
 | |
| 	nop
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 1 + BASE_SHIFT, TEMP1	/* RT: B moves back by K * 2 elements ... */
 | |
| 	sub	B, TEMP1, B
 | |
| 
 | |
| 	sll	LDC, 1, TEMP1	/* ... and C moves back by 2 * LDC */
 | |
| 	sub	C, TEMP1, C
 | |
| #endif
 | |
| 
 | |
| 	mov	C, C1	/* C1 and C2 address the two columns of this panel */
 | |
| 	add	C, LDC, C2
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	M, OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	mov	OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(RT)
 | |
| 	mov	A, AORIG
 | |
| #else
 | |
| 	mov	A, AO
 | |
| #endif
 | |
| 
 | |
| #ifndef RT
 | |
| 	add	C2, LDC, C	/* non-RT: C now points past the two columns handled here */
 | |
| #endif
 | |
| 
 | |
| 	and	M, 1, I	/* M & 1: a single leftover row is handled first as a 1x2 tile; otherwise skip to .LL150 */
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL150
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L	/* L = KK / 4 passes of the unrolled loop */
 | |
| 
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| 
 | |
| #ifdef LN
 | |
| 	sll	K,  0 + BASE_SHIFT, TEMP1	/* LN: AORIG moves back by K * 1 elements */
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 0 + BASE_SHIFT, TEMP1	/* A offset: KK * 1 elements */
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP2	/* B offset: KK * 2 elements */
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is kept for the remainder count at .LL175 */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| 
 | |
| 	LDF	[AO + 0 * SIZE], a1	/* preload four values from each of A and B, clearing accumulators and temporaries in between */
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t1
 | |
| 
 | |
|  	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c02
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t2
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c03
 | |
| 
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, t3
 | |
| 
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| 	ble,pn	%icc, .LL175	/* still tests the cmp L, 0 above: the loads and FMOVs leave %icc alone */
 | |
| 	nop
 | |
| 
 | |
| .LL172:	/* 1x2 tile, four k-steps per pass: a1 and a3 steps accumulate into c01/c02, a2 and a4 steps into c03/c04 (merged at .LL179) */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	AO,  4 * SIZE, AO	/* AO advances first, so the A loads below use next-pass offsets */
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[BO + 4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[BO + 5 * SIZE], b2
 | |
| 
 | |
| 	add	L, -1, L
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom; the later add to BO does not change condition codes */
 | |
| 	FMUL	a2, b3, t3
 | |
| 	LDF	[BO + 6 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[BO + 7 * SIZE], b4
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a3, b1, t1
 | |
| 	LDF	[BO +  8 * SIZE], b1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a3, b2, t2
 | |
| 	LDF	[BO +  9 * SIZE], b2
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO + 10 * SIZE], b3
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO + 11 * SIZE], b4
 | |
| 	add	BO,  8 * SIZE, BO	/* BO advances by 4 k-steps * 2 columns */
 | |
| 
 | |
| 	bg,pt	%icc, .LL172
 | |
| 	LDF	[AO + 3 * SIZE], a4	/* branch delay slot */
 | |
| 
 | |
| .LL175:	/* 1x2 tile: work out how many single k-steps remain after the unrolled loop */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L	/* L = KK mod 4 */
 | |
| #else
 | |
| 	and	TEMP1, 3, L	/* L = (K - KK) mod 4; TEMP1 was set in the tile setup */
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL179	/* no leftover k-steps: go to the solve */
 | |
| 	nop
 | |
| 
 | |
| .LL176:	/* 1x2 tile, one k-step per pass: only a1, b1, b2 and c01/c02 are used */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	AO, 1 * SIZE, AO
 | |
| 	LDF	[BO + 2 * SIZE], b1	/* BO has not advanced yet, so +2/+3 are the next k-step's pair */
 | |
| 	FADD	c02, t2, c02
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[BO + 3 * SIZE], b2
 | |
| 
 | |
| 	add	BO, 2 * SIZE, BO
 | |
| 	bg,pt	%icc, .LL176
 | |
| 	LDF	[AO + 0 * SIZE], a1	/* branch delay slot: AO already points at the next k-step */
 | |
| 
 | |
| .LL179:	/* 1x2 tile: retire pending products, merge the partial sums, solve, store and advance */
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| 
 | |
| 	FADD	c01, c03, c01	/* c03/c04 held the a2/a4 share of the unrolled loop's sums */
 | |
| 	FADD	c02, c04, c02
 | |
| 
 | |
| 
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 1, TEMP1	/* LN steps back by the tile height (1) */
 | |
| #else
 | |
| 	sub	KK, 2, TEMP1	/* RT steps back by the tile width (2) */
 | |
| #endif
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP2	/* A offset: TEMP1 * 1 elements */
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1	/* B offset: TEMP1 * 2 elements */
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 
 | |
| 	FSUB	a1, c01, c01	/* accumulator = loaded value - accumulator, assuming FSUB keeps the SPARC rs1, rs2, rd operand order — TODO confirm */
 | |
| 	FSUB	a2, c02, c02
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  0 * SIZE], a1	/* 1x1 block of A: a single scaling factor */
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1	/* 2x2 block of B: entries 0, 1 and 3 are used */
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  3 * SIZE], a1	/* 2x2 block of B walked in reverse: entries 3, 2 and 0 are used */
 | |
| 	LDF	[BO +  2 * SIZE], a2
 | |
| 	LDF	[BO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FMUL	a3, c01, c01
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	C1, -1 * SIZE, C1
 | |
| 	add	C2, -1 * SIZE, C2
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)	/* write the solved values back to the buffer they were read from */
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c02, [BO +  1 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C2 + 0 * SIZE]
 | |
| 
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 1 * SIZE, C1
 | |
| 	add	C2, 1 * SIZE, C2
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1	/* skip the K - KK k-steps this tile did not consume: 1 element each in A, 2 in B */
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	add	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| .LL150:	/* M & 2: set up a 2x2 tile for a leftover pair of rows, or skip to .LL170 */
 | |
| 	and	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL170
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L	/* L = KK / 4 passes of the unrolled loop */
 | |
| 
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| 
 | |
| #ifdef LN
 | |
| 	sll	K,  1 + BASE_SHIFT, TEMP1	/* LN: AORIG moves back by K * 2 elements */
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP1	/* A and B offsets are both KK * 2 elements for a 2x2 tile */
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is kept for the remainder count at .LL155 */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| 
 | |
| 	LDF	[AO + 0 * SIZE], a1	/* preload four values from each of A and B, clearing accumulators and temporaries in between */
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t1
 | |
| 
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	cmp	L,  0	/* repeats the comparison already made in both preprocessor branches above */
 | |
| 	FMOV	FZERO, c02
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t2
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c03
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, t3
 | |
| 
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, t4
 | |
| 	ble,pn	%icc, .LL155	/* fewer than four k-steps: go to the remainder handling */
 | |
| 	nop
 | |
| 
 | |
| .LL152:	/* 2x2 tile, four k-steps per pass: c01 += a_even*b_even, c02 += a_even*b_odd, c03 += a_odd*b_even, c04 += a_odd*b_odd */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	prefetch [AO + APREFETCHSIZE * SIZE], 0
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	add	BO,  8 * SIZE, BO	/* BO advances first, so B loads use negative offsets until the second half */
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[AO + 4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom; the later add to AO does not change condition codes */
 | |
| 	FMUL	a2, b1, t3
 | |
| 	LDF	[BO - 4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t4
 | |
| 	LDF	[AO + 5 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t1	/* second k-step uses the a3/a4 and b3/b4 pair */
 | |
| 	LDF	[BO - 3 * SIZE], b2
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t2
 | |
| 	LDF	[AO + 6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO - 2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	nop
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO + 7 * SIZE], a4
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a1, b1, t1	/* third k-step, with the values reloaded earlier in this pass */
 | |
| 	LDF	[BO - 1 * SIZE], b4
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[AO +  8 * SIZE], a1
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a2, b1, t3
 | |
| 	LDF	[BO +  0 * SIZE], b1
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a2, b2, t4
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a3, b3, t1	/* fourth k-step */
 | |
| 	LDF	[BO +  1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a3, b4, t2
 | |
| 	LDF	[AO + 10 * SIZE], a3
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO +  2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO + 11 * SIZE], a4
 | |
| 
 | |
| 	add	AO,  8 * SIZE, AO	/* AO advances by 4 k-steps * 2 rows */
 | |
| 	bg,pt	%icc, .LL152
 | |
| 	LDF	[BO +  3 * SIZE], b4	/* branch delay slot */
 | |
| 
 | |
| .LL155:	/* 2x2 tile: work out how many single k-steps remain after the unrolled loop */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L	/* L = KK mod 4 */
 | |
| #else
 | |
| 	and	TEMP1, 3, L	/* L = (K - KK) mod 4; TEMP1 was set in the tile setup */
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL159	/* no leftover k-steps: go to the solve */
 | |
| 	nop
 | |
| 
 | |
| .LL156:	/* 2x2 tile, one k-step per pass; not software-pipelined, operands are reloaded at the top of every pass */
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c01, t1, c01	/* retire the products left in t1..t4 by the previous pass or by the unrolled loop */
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| 
 | |
| 	FMUL	a1, b1, t1
 | |
| 	FMUL	a1, b2, t2
 | |
| 	FMUL	a2, b1, t3
 | |
| 	FMUL	a2, b2, t4
 | |
| 
 | |
| 	add	AO, 2 * SIZE, AO
 | |
| 	add	BO, 2 * SIZE, BO
 | |
| 
 | |
| 	add	L, -1, L
 | |
| 	cmp	L, 0
 | |
| 	bg,pt	%icc, .LL156
 | |
| 	nop
 | |
| 
 | |
| .LL159:	/* 2x2 tile: retire pending products, solve, store and advance */
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| 
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, TEMP1
 | |
| #else
 | |
| 	sub	KK, 2, TEMP1	/* same as the LN branch: tile height and width are both 2 */
 | |
| #endif
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)	/* LN/LT: the values at BO map to c01, c02, c03, c04 in that order */
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01	/* accumulator = loaded value - accumulator, assuming FSUB keeps the SPARC rs1, rs2, rd operand order — TODO confirm */
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| #else	/* RN/RT: the values at AO map to c01, c03, c02, c04 (note the swapped middle pair) */
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c03, c03
 | |
| 	FSUB	a3, c02, c02
 | |
| 	FSUB	a4, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  3 * SIZE], a1	/* 2x2 block of A walked in reverse: entries 3, 2 and 0 are used */
 | |
| 	LDF	[AO +  2 * SIZE], a2
 | |
| 	LDF	[AO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03	/* second row (c03, c04) is finished first */
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c04, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 	FMUL	a3, c01, c01
 | |
| 	FMUL	a3, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1	/* 2x2 block of A: entries 0, 1 and 3 are used */
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01	/* first row (c01, c02) is finished first */
 | |
| 	FMUL	a1, c02, c02
 | |
| 
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c02, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c04, t2, c04
 | |
| 
 | |
| 	FMUL	a3, c03, c03
 | |
| 	FMUL	a3, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1	/* 2x2 block of B: entries 0, 1 and 3 are used */
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01	/* first column (c01, c03) is finished first */
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c03, t2
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c04, t2, c04
 | |
| 	FMUL	a3, c02, c02
 | |
| 	FMUL	a3, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  3 * SIZE], a1	/* 2x2 block of B walked in reverse: entries 3, 2 and 0 are used */
 | |
| 	LDF	[BO +  2 * SIZE], a2
 | |
| 	LDF	[BO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02	/* second column (c02, c04) is finished first */
 | |
| 	FMUL	a1, c04, c04
 | |
| 
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c04, t2
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c03, t2, c03
 | |
| 
 | |
| 	FMUL	a3, c01, c01
 | |
| 	FMUL	a3, c03, c03
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	add	C1, -2 * SIZE, C1
 | |
| 	add	C2, -2 * SIZE, C2
 | |
| #endif
 | |
| 
 | |
| #if defined(LN) || defined(LT)	/* write the solved values back in the same order they were loaded */
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c02, [BO +  1 * SIZE]
 | |
| 	STF	c03, [BO +  2 * SIZE]
 | |
| 	STF	c04, [BO +  3 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c03, [AO +  1 * SIZE]
 | |
| 	STF	c02, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]	/* c01/c03 go to the C1 column, c02/c04 to the C2 column */
 | |
| 	STF	c03, [C1 + 1 * SIZE]
 | |
| 	STF	c02, [C2 + 0 * SIZE]
 | |
| 	STF	c04, [C2 + 1 * SIZE]
 | |
| 
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 2 * SIZE, C1
 | |
| 	add	C2, 2 * SIZE, C2
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sll	K, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1	/* skip the K - KK k-steps this tile did not consume: 2 elements each in A and B */
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	add	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| .LL170:	/* Four-row tiles that store to C1 and C2: I = M >> 2, go to .LL199 when I <= 0 */
 | |
| 	sra	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL199
 | |
| 	FMOV	FZERO, c03	/* branch delay slot: c03 is zeroed whether or not the branch is taken */
 | |
| 
 | |
| .LL121:	/* Top of the per-tile loop (re-entered while I > 0): choose AO/BO and the unrolled trip count L */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| /* LT/RN: L = KK >> 2 and BO restarts at B; AO is left as it is */
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << (1 + BASE_SHIFT)), L = (K - KK) >> 2 */
 | |
| #ifdef LN
 | |
| 	sll	K,  2 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG	/* LN only: AORIG first moves back by K << (2 + BASE_SHIFT) */
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 2 + BASE_SHIFT, TEMP1
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is read again at .LL125 for the remainder count */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| /* Preload a1-a4 from AO and b1-b4 from BO, interleaved with zeroing t1-t4 and the c accumulators */
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, t1
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, c07
 | |
| 
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, t2
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, c04
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, t3
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, c08
 | |
| 
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, t4
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, c01
 | |
| /* Prefetch the C1/C2 locations this tile will store to (LN prefetches below the pointers, the others above) */
 | |
| #ifdef LN
 | |
| 	prefetch [C1 - 3 * SIZE], 2
 | |
| 	FMOV	FZERO, c05
 | |
| 	prefetch [C2 - 3 * SIZE], 2
 | |
| 	FMOV	FZERO, c02
 | |
| #else
 | |
| 	prefetch [C1 + 3 * SIZE], 2
 | |
| 	FMOV	FZERO, c05
 | |
| 	prefetch [C2 + 3 * SIZE], 2
 | |
| 	FMOV	FZERO, c02
 | |
| #endif
 | |
| /* Consumes the cmp L, 0 issued above: L <= 0 skips the unrolled loop */
 | |
| 	ble,pn	%icc, .LL125
 | |
| 	FMOV	FZERO, c06	/* branch delay slot: c06 is zeroed on both paths */
 | |
| 
 | |
| .LL122:	/* Unrolled loop: 32 FMULs per pass, AO advances 16 * SIZE and BO 8 * SIZE, L drops by one */
 | |
| 	FADD	c03, t1, c03
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	prefetch [AO + APREFETCHSIZE * SIZE], 0
 | |
| /* t1-t4 are consumed late: the a1*b1 product formed above is added to c01 four groups further down */
 | |
| 	FADD	c07, t2, c07
 | |
| 	add	BO,  8 * SIZE, BO
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[AO + 4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c04, t3, c04
 | |
| 	add	AO, 16 * SIZE, AO
 | |
| 	FMUL	a2, b1, t3
 | |
| 	cmp	L,  0	/* sets %icc for the bg at the bottom of the loop */
 | |
| /* AO and BO are already advanced, so the loads below use offsets relative to the new pointers */
 | |
| 	FADD	c08, t4, c08
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t4
 | |
| 	LDF	[AO - 11 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a3, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	LDF	[AO - 10 * SIZE], a3
 | |
| 
 | |
| 	FADD	c02, t3, c02
 | |
| 	nop
 | |
| 	FMUL	a4, b1, t3
 | |
| 	LDF	[BO -  4 * SIZE], b1
 | |
| 
 | |
| 	FADD	c06, t4, c06
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t4
 | |
| 	LDF	[BO -  3 * SIZE], b2
 | |
| /* Second group of eight products: a1-a4 against b3/b4 */
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a1, b3, t1
 | |
| 	LDF	[AO -  9 * SIZE], a4
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a1, b4, t2
 | |
| 	LDF	[AO -  8 * SIZE], a1
 | |
| 
 | |
| 	FADD	c04, t3, c04
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c08, t4, c08
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO -  7 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t2
 | |
| 	LDF	[AO -  6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c02, t3, c02
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO -  2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c06, t4, c06
 | |
| 	nop
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[BO -  1 * SIZE], b4
 | |
| /* Third group: the reloaded a1-a4 against the reloaded b1/b2 */
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO -  5 * SIZE], a4
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[AO -  4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c04, t3, c04
 | |
| 	nop
 | |
| 	FMUL	a2, b1, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c08, t4, c08
 | |
| 	nop
 | |
| 	FMUL	a2, b2, t4
 | |
| 	LDF	[AO -  3 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a3, b1, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a3, b2, t2
 | |
| 	LDF	[AO -  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c02, t3, c02
 | |
| 	nop
 | |
| 	FMUL	a4, b1, t3
 | |
| 	LDF	[BO +  0 * SIZE], b1
 | |
| 
 | |
| 	FADD	c06, t4, c06
 | |
| 	nop
 | |
| 	FMUL	a4, b2, t4
 | |
| 	LDF	[BO +  1 * SIZE], b2
 | |
| /* Fourth group: a1-a4 against b3/b4; the loads at offsets 0..3 fetch operands for the next pass or for .LL126 */
 | |
| 	FADD	c03, t1, c03
 | |
| 	nop
 | |
| 	FMUL	a1, b3, t1
 | |
| 	LDF	[AO -  1 * SIZE], a4
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	nop
 | |
| 	FMUL	a1, b4, t2
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c04, t3, c04
 | |
| 	nop
 | |
| 	FMUL	a2, b3, t3
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c08, t4, c08
 | |
| 	nop
 | |
| 	FMUL	a2, b4, t4
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	nop
 | |
| 	FMUL	a3, b3, t1
 | |
| 	nop
 | |
| 
 | |
| 	FADD	c05, t2, c05
 | |
| 	nop
 | |
| 	FMUL	a3, b4, t2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c02, t3, c02
 | |
| 	nop
 | |
| 	FMUL	a4, b3, t3
 | |
| 	LDF	[BO +  2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c06, t4, c06
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| /* Loop while L > 0 (condition codes come from the cmp near the top of the pass) */
 | |
| 	bg,pt	%icc, .LL122
 | |
| 	LDF	[BO +  3 * SIZE], b4	/* branch delay slot: b4 is loaded on both paths */
 | |
| 
 | |
| .LL125:	/* Remainder count: L = KK & 3 (LT/RN) or TEMP1 & 3 (LN/RT, where TEMP1 still holds K - KK) */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL129	/* no remainder steps: go straight to the drain/update code */
 | |
| 	nop
 | |
| 
 | |
| .LL126:	/* Remainder loop: eight products per pass, AO += 4 * SIZE, BO += 2 * SIZE, L -= 1 */
 | |
| 	FADD	c03, t1, c03
 | |
| 	add	AO, 4 * SIZE, AO
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	BO, 2 * SIZE, BO
 | |
| 
 | |
| 	FADD	c07, t2, c07
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b2, t2
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c04, t3, c04
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom of the loop */
 | |
| 	FMUL	a2, b1, t3
 | |
| 
 | |
| 	FADD	c08, t4, c08
 | |
| 	FMUL	a2, b2, t4
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a3, b1, t1
 | |
| 	FADD	c05, t2, c05
 | |
| 	FMUL	a3, b2, t2
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 
 | |
| 	FADD	c02, t3, c02
 | |
| 	FMUL	a4, b1, t3
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FADD	c06, t4, c06
 | |
| 	FMUL	a4, b2, t4
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	bg,pt	%icc, .LL126
 | |
| 	LDF	[AO + 3 * SIZE], a4	/* branch delay slot: a4 is loaded on both paths */
 | |
| 
 | |
| .LL129:	/* Add the products still pending in t1-t4, then run the LN/LT/RN/RT update and store the tile */
 | |
| 	FADD	c03, t1, c03
 | |
| 	FADD	c07, t2, c07
 | |
| 	FADD	c04, t3, c04
 | |
| 	FADD	c08, t4, c08
 | |
| /* LN/RT: re-aim AO at AORIG and BO at B, offset by KK - 4 (LN) or KK - 2 (RT) steps */
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 4, TEMP1
 | |
| #else
 | |
| 	sub	KK, 2, TEMP1
 | |
| #endif
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| /* Combine eight loaded values (from BO for LN/LT, from AO for RN/RT) with c01-c08; FSUB is defined outside this view, presumably c = loaded - c */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[BO +  4 * SIZE], b1
 | |
| 	LDF	[BO +  5 * SIZE], b2
 | |
| 	LDF	[BO +  6 * SIZE], b3
 | |
| 	LDF	[BO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c05, c05
 | |
| 	FSUB	a3, c02, c02
 | |
| 	FSUB	a4, c06, c06
 | |
| 
 | |
| 	FSUB	b1, c03, c03
 | |
| 	FSUB	b2, c07, c07
 | |
| 	FSUB	b3, c04, c04
 | |
| 	FSUB	b4, c08, c08
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	LDF	[AO +  4 * SIZE], b1
 | |
| 	LDF	[AO +  5 * SIZE], b2
 | |
| 	LDF	[AO +  6 * SIZE], b3
 | |
| 	LDF	[AO +  7 * SIZE], b4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| 
 | |
| 	FSUB	b1, c05, c05
 | |
| 	FSUB	b2, c06, c06
 | |
| 	FSUB	b3, c07, c07
 | |
| 	FSUB	b4, c08, c08
 | |
| #endif
 | |
| /* LN: coefficients are read from AO + 15 down to AO + 0; c04/c08 are finished first, c01/c05 last */
 | |
| #ifdef LN
 | |
| 	LDF	[AO + 15 * SIZE], a1
 | |
| 	LDF	[AO + 14 * SIZE], a2
 | |
| 	LDF	[AO + 13 * SIZE], a3
 | |
| 	LDF	[AO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a1, c08, c08
 | |
| 	FMUL	a2, c04, t1
 | |
| 	FMUL	a2, c08, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FMUL	a3, c04, t1
 | |
| 	FMUL	a3, c08, t2
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FMUL	a4, c04, t1
 | |
| 	FMUL	a4, c08, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 	LDF	[AO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c07, t2
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FMUL	a3, c03, t1
 | |
| 	FMUL	a3, c07, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1
 | |
| 	LDF	[AO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c05, t2, c05
 | |
| 
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c05, c05
 | |
| #endif
 | |
| /* LT: coefficients are read from AO + 0 up to AO + 15; c01/c05 are finished first, c04/c08 last */
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c05, t2
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FMUL	a3, c01, t1
 | |
| 	FMUL	a3, c05, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FMUL	a4, c01, t1
 | |
| 	FMUL	a4, c05, t2
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1
 | |
| 	LDF	[AO +  6 * SIZE], a2
 | |
| 	LDF	[AO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FSUB	c07, t2, c07
 | |
| 	FMUL	a3, c02, t1
 | |
| 	FMUL	a3, c06, t2
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1
 | |
| 	LDF	[AO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a2, c03, t1
 | |
| 	FMUL	a2, c07, t2
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 	FSUB	c08, t2, c08
 | |
| 
 | |
| 	LDF	[AO + 15 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a1, c08, c08
 | |
| #endif
 | |
| /* RN: scale c01-c04 by [BO + 0], subtract [BO + 1] times them from c05-c08, then scale c05-c08 by [BO + 3] */
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| 
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FMUL	a2, c02, t2
 | |
| 	FMUL	a2, c03, t3
 | |
| 	FMUL	a2, c04, t4
 | |
| 
 | |
| 	FSUB	c05, t1, c05
 | |
| 	FSUB	c06, t2, c06
 | |
| 	FSUB	c07, t3, c07
 | |
| 	FSUB	c08, t4, c08
 | |
| 
 | |
| 	FMUL	a3, c05, c05
 | |
| 	FMUL	a3, c06, c06
 | |
| 	FMUL	a3, c07, c07
 | |
| 	FMUL	a3, c08, c08
 | |
| #endif
 | |
| /* RT: scale c05-c08 by [BO + 3], subtract [BO + 2] times them from c01-c04, then scale c01-c04 by [BO + 0] */
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  3 * SIZE], a1
 | |
| 	LDF	[BO +  2 * SIZE], a2
 | |
| 	LDF	[BO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c05, c05
 | |
| 	FMUL	a1, c06, c06
 | |
| 	FMUL	a1, c07, c07
 | |
| 	FMUL	a1, c08, c08
 | |
| 
 | |
| 	FMUL	a2, c05, t1
 | |
| 	FMUL	a2, c06, t2
 | |
| 	FMUL	a2, c07, t3
 | |
| 	FMUL	a2, c08, t4
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FSUB	c02, t2, c02
 | |
| 	FSUB	c03, t3, c03
 | |
| 	FSUB	c04, t4, c04
 | |
| 
 | |
| 	FMUL	a3, c01, c01
 | |
| 	FMUL	a3, c02, c02
 | |
| 	FMUL	a3, c03, c03
 | |
| 	FMUL	a3, c04, c04
 | |
| #endif
 | |
| /* LN steps C1/C2 back by 4 * SIZE before storing; the other variants step forward after storing */
 | |
| #ifdef LN
 | |
| 	add	C1, -4 * SIZE, C1
 | |
| 	add	C2, -4 * SIZE, C2
 | |
| #endif
 | |
| /* Store the results back through BO (LN/LT) or AO (RN/RT), using the same element order as the loads above */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c05, [BO +  1 * SIZE]
 | |
| 	STF	c02, [BO +  2 * SIZE]
 | |
| 	STF	c06, [BO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c03, [BO +  4 * SIZE]
 | |
| 	STF	c07, [BO +  5 * SIZE]
 | |
| 	STF	c04, [BO +  6 * SIZE]
 | |
| 	STF	c08, [BO +  7 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| 	STF	c03, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| 
 | |
| 	STF	c05, [AO +  4 * SIZE]
 | |
| 	STF	c06, [AO +  5 * SIZE]
 | |
| 	STF	c07, [AO +  6 * SIZE]
 | |
| 	STF	c08, [AO +  7 * SIZE]
 | |
| #endif
 | |
| /* Store c01-c04 to C1 and c05-c08 to C2 */
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C1 + 1 * SIZE]
 | |
| 	STF	c03, [C1 + 2 * SIZE]
 | |
| 	STF	c04, [C1 + 3 * SIZE]
 | |
| 
 | |
| 	STF	c05, [C2 + 0 * SIZE]
 | |
| 	STF	c06, [C2 + 1 * SIZE]
 | |
| 	STF	c07, [C2 + 2 * SIZE]
 | |
| 	STF	c08, [C2 + 3 * SIZE]
 | |
| /* Clear t1-t4 so the next tile's first FADDs add zero */
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 4 * SIZE, C1
 | |
| 	add	C2, 4 * SIZE, C2
 | |
| #endif
 | |
| /* RT: AORIG advances by K << (2 + BASE_SHIFT) */
 | |
| #ifdef RT
 | |
| 	sll	K, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| /* LT/RN: AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << (1 + BASE_SHIFT) */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| /* KK moves by the tile height: +4 for LT, -4 for LN */
 | |
| #ifdef LT
 | |
| 	add	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 4, KK
 | |
| #endif
 | |
| /* Next tile: I -= 1 and loop back to .LL121 while I > 0 */
 | |
| 	add	I, -1, I
 | |
| 	cmp	I, 0
 | |
| 
 | |
| 	bg,pt	%icc, .LL121
 | |
| 	FMOV	FZERO, c03	/* branch delay slot: c03 is re-zeroed for the next tile */
 | |
| 
 | |
| .LL199:	/* End of the section that stores to C1 and C2: update B and KK for whatever follows */
 | |
| #ifdef LN
 | |
| 	sll	K, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	B, TEMP1, B	/* LN: B += K << (1 + BASE_SHIFT) */
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	mov	BO, B	/* LT/RN: B continues from where BO ended */
 | |
| #endif
 | |
| /* RN/RT step KK by 2 in opposite directions */
 | |
| #ifdef RN
 | |
| 	add	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sub	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| .LL200:	/* Leftover part for odd N: J = N & 1; when J <= 0 control goes to .LL999. Only C1 is written from here on */
 | |
| 	and	N, 1, J
 | |
| 
 | |
| 	cmp	J, 0
 | |
| 	ble,pn	%icc, .LL999
 | |
| 	nop
 | |
| /* RT: move B back by K << BASE_SHIFT and C back by LDC before using them */
 | |
| #ifdef RT
 | |
| 	sll	K, 0 + BASE_SHIFT, TEMP1
 | |
| 	sub	B, TEMP1, B
 | |
| 
 | |
| 	sub	C, LDC, C
 | |
| #endif
 | |
| 
 | |
| 	mov	C, C1
 | |
| /* KK seed: M + OFFSET for LN, OFFSET for LT (RN/RT leave KK unchanged here) */
 | |
| #ifdef LN
 | |
| 	add	M, OFFSET, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	mov	OFFSET, KK
 | |
| #endif
 | |
| /* LN/RT address A through AORIG; LT/RN stream it through AO */
 | |
| #if defined(LN) || defined(RT)
 | |
| 	mov	A, AORIG
 | |
| #else
 | |
| 	mov	A, AO
 | |
| #endif
 | |
| 
 | |
| #ifndef RT
 | |
| 	add	C, LDC, C
 | |
| #endif
 | |
| /* Single-row tile: done only when bit 0 of M is set, otherwise continue at .LL250 */
 | |
| 	and	M, 1, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL250
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| /* LT/RN: L = KK >> 2 and BO restarts at B */
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| /* LN/RT: AO = AORIG + (KK << BASE_SHIFT), BO = B + (KK << BASE_SHIFT), L = (K - KK) >> 2 */
 | |
| #ifdef LN
 | |
| 	sll	K,  0 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 0 + BASE_SHIFT, TEMP1
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is read again at .LL275 */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| /* Preload a1-a4 and b1-b3 and zero t1-t4, c01, c02 */
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, t1
 | |
|  	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c01
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, t2
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c02
 | |
| 
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t3
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t4
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| /* Consumes the cmp L, 0 above: L <= 0 skips the unrolled loop */
 | |
| 	ble,pn	%icc, .LL275
 | |
| 	LDF	[BO + 3 * SIZE], b4	/* branch delay slot: b4 is loaded on both paths */
 | |
| 
 | |
| .LL272:	/* Unrolled loop: four products per pass (a1*b1 .. a4*b4), AO and BO each advance 4 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	add	AO,  4 * SIZE, AO
 | |
| 
 | |
| 	FMUL	a1, b1, t1
 | |
| 	add	BO,  4 * SIZE, BO
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom of the loop */
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMUL	a2, b2, t2
 | |
| /* t1/t3 feed c01 and t2/t4 feed c02; the two sums are combined at .LL279 */
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FADD	c01, t3, c01
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMUL	a3, b3, t3
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FADD	c02, t4, c02
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 
 | |
| 	bg,pt	%icc, .LL272
 | |
| 	LDF	[BO + 3 * SIZE], b4	/* branch delay slot: b4 is loaded on both paths */
 | |
| 
 | |
| .LL275:	/* Remainder count: L = KK & 3 (LT/RN) or TEMP1 & 3 (LN/RT, where TEMP1 still holds K - KK) */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL279	/* no remainder steps: go straight to the final sums */
 | |
| 	nop
 | |
| 
 | |
| .LL276:	/* Remainder loop: one product per pass through the t1 -> c01 chain; AO and BO each advance 1 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO + 1 * SIZE], a1
 | |
| 
 | |
| 	LDF	[BO + 1 * SIZE], b1
 | |
| 	add	BO, 1 * SIZE, BO
 | |
| 	cmp	L, 0
 | |
| 	bg,pt	%icc, .LL276
 | |
| 	add	AO, 1 * SIZE, AO	/* branch delay slot: AO advances on both paths */
 | |
| 
 | |
| .LL279:	/* Add the pending t1-t4 into c01/c02, fold c02 into c01, then run the update on the single result */
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c01, t3, c01
 | |
| 	FADD	c02, t4, c02
 | |
| 
 | |
| 	FADD	c01, c02, c01
 | |
| /* LN/RT: AO = AORIG + ((KK - 1) << BASE_SHIFT), BO = B + ((KK - 1) << BASE_SHIFT) */
 | |
| #if defined(LN) || defined(RT)
 | |
| 	sub	KK, 1, TEMP1
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| /* Combine the loaded value (from BO for LN/LT, from AO for RN/RT) with c01; FSUB is defined outside this view, presumably c01 = loaded - c01 */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	FSUB	a1, c01, c01
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	FSUB	a1, c01, c01
 | |
| #endif
 | |
| /* Each variant multiplies c01 by one coefficient: [AO + 0] for LN/LT, [BO + 0] for RN/RT */
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| 
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| 
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| /* LN steps C1 back by 1 * SIZE before storing; the other variants step forward after storing */
 | |
| #ifdef LN
 | |
| 	add	C1, -1 * SIZE, C1
 | |
| #endif
 | |
| /* Store the result back through BO (LN/LT) or AO (RN/RT), then to C1 */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| /* Clear t1-t4 so the next tile's first FADDs add zero */
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 1 * SIZE, C1
 | |
| #endif
 | |
| /* RT: AORIG advances by K << BASE_SHIFT */
 | |
| #ifdef RT
 | |
| 	sll	K, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| /* LT/RN: AO and BO both advance by (K - KK) << BASE_SHIFT */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP1, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| /* KK moves by the tile height: +1 for LT, -1 for LN */
 | |
| #ifdef LT
 | |
| 	add	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| .LL250:	/* Two-row tile: done only when bit 1 of M is set, otherwise continue at .LL270 */
 | |
| 	and	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL270
 | |
| 	nop
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| /* LT/RN: L = KK >> 2 and BO restarts at B */
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| /* LN/RT: AO = AORIG + (KK << (1 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), L = (K - KK) >> 2 */
 | |
| #ifdef LN
 | |
| 	sll	K,  1 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 1 + BASE_SHIFT, TEMP1
 | |
| 	sll	KK, 0 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is read again at .LL255 */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| /* Preload a1-a4 and b1-b4 and zero c01-c04 and t1-t4 */
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t1
 | |
| 
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c02
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t2
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c03
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, t3
 | |
| 
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, t4
 | |
| /* Consumes the cmp L, 0 above: L <= 0 skips the unrolled loop */
 | |
| 	ble,pn	%icc, .LL255
 | |
| 	nop
 | |
| 
 | |
| .LL252:	/* Unrolled loop: eight products per pass, AO advances 8 * SIZE and BO 4 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO + 4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b1, t2
 | |
| 	LDF	[AO +  5 * SIZE], a2
 | |
| 	LDF	[BO +  4 * SIZE], b1
 | |
| /* a1/a2 products (with b1, b3) accumulate in c01/c02; a3/a4 products (with b2, b4) accumulate in c03/c04 */
 | |
| 	FADD	c03, t3, c03
 | |
| 	cmp	L, 0	/* sets %icc for the bg at the bottom of the loop */
 | |
| 	FMUL	a3, b2, t3
 | |
| 	LDF	[AO +  6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b2, t4
 | |
| 	LDF	[AO +  7 * SIZE], a4
 | |
| 	LDF	[BO +  5 * SIZE], b2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a1, b3, t1
 | |
| 	LDF	[AO +  8 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b3, t2
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 	LDF	[BO +  6 * SIZE], b3
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a3, b4, t3
 | |
| 	LDF	[AO + 10 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO + 11 * SIZE], a4
 | |
| 	add	AO,  8 * SIZE, AO
 | |
| 
 | |
| 	LDF	[BO +  7 * SIZE], b4
 | |
| 	bg,pt	%icc, .LL252
 | |
| 	add	BO,  4 * SIZE, BO	/* branch delay slot: BO advances on both paths */
 | |
| 
 | |
| .LL255:	/* Remainder count: L = KK & 3 (LT/RN) or TEMP1 & 3 (LN/RT, where TEMP1 still holds K - KK) */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL259	/* no remainder steps: go straight to the final sums */
 | |
| 	nop
 | |
| 
 | |
| .LL256:	/* Remainder loop: a1*b1 and a2*b1 per pass; AO advances 2 * SIZE, BO 1 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO + 2 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a2, b1, t2
 | |
| 	LDF	[AO + 3 * SIZE], a2
 | |
| 
 | |
| 	LDF	[BO + 1 * SIZE], b1
 | |
| 	add	AO, 2 * SIZE, AO
 | |
| 
 | |
| 	bg,pt	%icc, .LL256
 | |
| 	add	BO, 1 * SIZE, BO	/* branch delay slot: BO advances on both paths */
 | |
| 
 | |
| .LL259:	/* Add the pending t1-t4, fold c03/c04 into c01/c02, then run the update on the two results */
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| 
 | |
| 	FADD	c01, c03, c01
 | |
| 	FADD	c02, c04, c02
 | |
| /* LN/RT: re-aim AO at AORIG and BO at B, offset by KK - 2 (LN) or KK - 1 (RT) steps */
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, TEMP1
 | |
| #else
 | |
| 	sub	KK, 1, TEMP1
 | |
| #endif
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| /* Combine two loaded values (from BO for LN/LT, from AO for RN/RT) with c01/c02; FSUB is defined outside this view, presumably c = loaded - c */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| #endif
 | |
| /* LN: c02 is finished first using [AO + 3], then c01 using [AO + 2] and [AO + 0] */
 | |
| #ifdef LN
 | |
| 	LDF	[AO +  3 * SIZE], a1
 | |
| 	LDF	[AO +  2 * SIZE], a2
 | |
| 	LDF	[AO +  0 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 	FMUL	a3, c01, c01
 | |
| #endif
 | |
| /* LT: c01 is finished first using [AO + 0], then c02 using [AO + 1] and [AO + 3] */
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  3 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c02, c02
 | |
| #endif
 | |
| /* RN/RT: both results are multiplied by the single coefficient at [BO + 0] */
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| #endif
 | |
| /* LN steps C1 back by 2 * SIZE before storing; the other variants step forward after storing */
 | |
| #ifdef LN
 | |
| 	add	C1, -2 * SIZE, C1
 | |
| #endif
 | |
| /* Store the results back through BO (LN/LT) or AO (RN/RT), then to C1 */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c02, [BO +  1 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C1 + 1 * SIZE]
 | |
| /* Clear t1-t4 so the next tile's first FADDs add zero */
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 2 * SIZE, C1
 | |
| #endif
 | |
| /* RT: AORIG advances by K << (1 + BASE_SHIFT) */
 | |
| #ifdef RT
 | |
| 	sll	K, 1 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| /* LT/RN: AO += (K - KK) << (1 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 1 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| /* KK moves by the tile height: +2 for LT, -2 for LN */
 | |
| #ifdef LT
 | |
| 	add	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 2, KK
 | |
| #endif
 | |
| 
 | |
| .LL270:	/* Four-row tiles that store only to C1: I = M >> 2, go to .LL299 when I <= 0 */
 | |
| 	sra	M, 2, I
 | |
| 	cmp	I, 0
 | |
| 	ble,pn	%icc, .LL299
 | |
| 	nop
 | |
| 
 | |
| .LL221:	/* Top of the per-tile loop (re-entered while I > 0): choose AO/BO and the unrolled trip count L */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sra	KK, 2, L
 | |
| /* LT/RN: L = KK >> 2 and BO restarts at B; AO is left as it is */
 | |
| 	mov	B, BO
 | |
| 	cmp	L,  0
 | |
| #else
 | |
| /* LN/RT: AO = AORIG + (KK << (2 + BASE_SHIFT)), BO = B + (KK << BASE_SHIFT), L = (K - KK) >> 2 */
 | |
| #ifdef LN
 | |
| 	sll	K,  2 + BASE_SHIFT, TEMP1
 | |
| 	sub	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| 
 | |
| 	sll	KK, 2 + BASE_SHIFT, TEMP1
 | |
| 	sll	KK, 0 + BASE_SHIFT, TEMP2
 | |
| 
 | |
| 	add	AORIG, TEMP1, AO
 | |
| 	add	B,     TEMP2, BO
 | |
| 
 | |
| 	sub	K, KK, TEMP1	/* TEMP1 = K - KK is read again at .LL225 */
 | |
| 	sra	TEMP1, 2, L
 | |
| 	cmp	L,  0
 | |
| #endif
 | |
| /* Preload a1-a4 and b1-b4 and zero c01-c04 and t1-t4 */
 | |
| 	LDF	[AO + 0 * SIZE], a1
 | |
| 	FMOV	FZERO, c01
 | |
| 	LDF	[BO + 0 * SIZE], b1
 | |
| 	FMOV	FZERO, t1
 | |
| 
 | |
| 	LDF	[AO + 1 * SIZE], a2
 | |
| 	FMOV	FZERO, c02
 | |
| 	LDF	[BO + 1 * SIZE], b2
 | |
| 	FMOV	FZERO, t2
 | |
| 
 | |
| 	LDF	[AO + 2 * SIZE], a3
 | |
| 	FMOV	FZERO, c03
 | |
| 	LDF	[BO + 2 * SIZE], b3
 | |
| 	FMOV	FZERO, t3
 | |
| 
 | |
| 	LDF	[AO + 3 * SIZE], a4
 | |
| 	FMOV	FZERO, c04
 | |
| 	LDF	[BO + 3 * SIZE], b4
 | |
| 	FMOV	FZERO, t4
 | |
| /* Prefetch around C1: below the pointer for LN, above it otherwise */
 | |
| #ifdef LN
 | |
| 	prefetch [C1 - 3 * SIZE], 2
 | |
| #else
 | |
| 	prefetch [C1 + 3 * SIZE], 2
 | |
| #endif
 | |
| /* Consumes the cmp L, 0 above: L <= 0 skips the unrolled loop */
 | |
| 	ble,pn	%icc, .LL225
 | |
| 	prefetch [C1 + 4 * SIZE], 2	/* branch delay slot: issued on both paths and for every variant */
 | |
| 
 | |
| .LL222:	/* Unrolled loop: 16 products per pass (a1-a4 against b1, b2, b3, b4 in turn); AO advances 16 * SIZE, BO 4 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	BO,  4 * SIZE, BO
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO +  4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b1, t2
 | |
| 	LDF	[AO +  5 * SIZE], a2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a3, b1, t3
 | |
| 	LDF	[AO +  6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b1, t4
 | |
| 	LDF	[AO +  7 * SIZE], a4
 | |
| 	LDF	[BO +  0 * SIZE], b1
 | |
| /* BO was advanced at the top of the pass, so [BO + 0..3] already refer to the next pass's b values */
 | |
| 	FADD	c01, t1, c01
 | |
| 	cmp	L,  0	/* sets %icc for the bg at the bottom of the loop */
 | |
| 	FMUL	a1, b2, t1
 | |
| 	LDF	[AO +  8 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b2, t2
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a3, b2, t3
 | |
| 	LDF	[AO + 10 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b2, t4
 | |
| 	LDF	[AO + 11 * SIZE], a4
 | |
| 	LDF	[BO +  1 * SIZE], b2
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a1, b3, t1
 | |
| 	LDF	[AO + 12 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b3, t2
 | |
| 	LDF	[AO + 13 * SIZE], a2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a3, b3, t3
 | |
| 	LDF	[AO + 14 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b3, t4
 | |
| 	LDF	[AO + 15 * SIZE], a4
 | |
| 	LDF	[BO +  2 * SIZE], b3
 | |
| 
 | |
| 	FADD	c01, t1, c01
 | |
| 	FMUL	a1, b4, t1
 | |
| 	LDF	[AO + 16 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	FMUL	a2, b4, t2
 | |
| 	LDF	[AO + 17 * SIZE], a2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	FMUL	a3, b4, t3
 | |
| 	LDF	[AO + 18 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b4, t4
 | |
| 	LDF	[AO + 19 * SIZE], a4
 | |
| 	add	AO, 16 * SIZE, AO
 | |
| 
 | |
| 	bg,pt	%icc, .LL222
 | |
| 	LDF	[BO +  3 * SIZE], b4	/* branch delay slot: b4 is loaded on both paths */
 | |
| 
 | |
| .LL225:	/* Remainder count: L = KK & 3 (LT/RN) or TEMP1 & 3 (LN/RT, where TEMP1 still holds K - KK) */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	and	KK,  3, L
 | |
| #else
 | |
| 	and	TEMP1, 3, L
 | |
| #endif
 | |
| 	cmp	L,  0
 | |
| 	ble,a,pn %icc, .LL229	/* no remainder steps: go straight to the drain/update code */
 | |
| 	nop
 | |
| 
 | |
| .LL226:	/* Remainder loop: a1-a4 times b1 per pass; AO advances 4 * SIZE, BO 1 * SIZE */
 | |
| 	FADD	c01, t1, c01
 | |
| 	add	BO, 1 * SIZE, BO
 | |
| 	FMUL	a1, b1, t1
 | |
| 	LDF	[AO + 4 * SIZE], a1
 | |
| 
 | |
| 	FADD	c02, t2, c02
 | |
| 	add	L, -1, L
 | |
| 	FMUL	a2, b1, t2
 | |
| 	LDF	[AO + 5 * SIZE], a2
 | |
| 
 | |
| 	FADD	c03, t3, c03
 | |
| 	cmp	L, 0
 | |
| 	FMUL	a3, b1, t3
 | |
| 	LDF	[AO + 6 * SIZE], a3
 | |
| 
 | |
| 	FADD	c04, t4, c04
 | |
| 	FMUL	a4, b1, t4
 | |
| 	LDF	[AO + 7 * SIZE], a4
 | |
| 	add	AO, 4 * SIZE, AO
 | |
| 
 | |
| 	bg,pt	%icc, .LL226
 | |
| 	LDF	[BO + 0 * SIZE], b1	/* branch delay slot: b1 is reloaded from the already-advanced BO */
 | |
| 
 | |
| .LL229:	/* Add the products still pending in t1-t4, then run the LN/LT/RN/RT update and store the tile */
 | |
| 	FADD	c01, t1, c01
 | |
| 	FADD	c02, t2, c02
 | |
| 	FADD	c03, t3, c03
 | |
| 	FADD	c04, t4, c04
 | |
| /* LN/RT: re-aim AO at AORIG and BO at B, offset by KK - 4 (LN) or KK - 1 (RT) steps */
 | |
| #if defined(LN) || defined(RT)
 | |
| #ifdef LN
 | |
| 	sub	KK, 4, TEMP1
 | |
| #else
 | |
| 	sub	KK, 1, TEMP1
 | |
| #endif
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP2, AO
 | |
| 	add	B,     TEMP1, BO
 | |
| #endif
 | |
| /* Combine four loaded values (from BO for LN/LT, from AO for RN/RT) with c01-c04; FSUB is defined outside this view, presumably c = loaded - c */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 	LDF	[BO +  1 * SIZE], a2
 | |
| 	LDF	[BO +  2 * SIZE], a3
 | |
| 	LDF	[BO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| #else
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FSUB	a1, c01, c01
 | |
| 	FSUB	a2, c02, c02
 | |
| 	FSUB	a3, c03, c03
 | |
| 	FSUB	a4, c04, c04
 | |
| #endif
 | |
| /* LN: coefficients are read from AO + 15 down to AO + 0; c04 is finished first, c01 last */
 | |
| #ifdef LN
 | |
| 	LDF	[AO + 15 * SIZE], a1
 | |
| 	LDF	[AO + 14 * SIZE], a2
 | |
| 	LDF	[AO + 13 * SIZE], a3
 | |
| 	LDF	[AO + 12 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| 	FMUL	a2, c04, t1
 | |
| 
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a3, c04, t1
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a4, c04, t1
 | |
| 
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1
 | |
| 	LDF	[AO +  9 * SIZE], a2
 | |
| 	LDF	[AO +  8 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a2, c03, t1
 | |
| 
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c03, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1
 | |
| 	LDF	[AO +  4 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c01, t1, c01
 | |
| 
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| #endif
 | |
| /* LT: coefficients are read from AO + 0 up to AO + 15; c01 is finished first, c04 last */
 | |
| #ifdef LT
 | |
| 	LDF	[AO +  0 * SIZE], a1
 | |
| 	LDF	[AO +  1 * SIZE], a2
 | |
| 	LDF	[AO +  2 * SIZE], a3
 | |
| 	LDF	[AO +  3 * SIZE], a4
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a2, c01, t1
 | |
| 	FSUB	c02, t1, c02
 | |
| 	FMUL	a3, c01, t1
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a4, c01, t1
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[AO +  5 * SIZE], a1
 | |
| 	LDF	[AO +  6 * SIZE], a2
 | |
| 	LDF	[AO +  7 * SIZE], a3
 | |
| 
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a2, c02, t1
 | |
| 	FSUB	c03, t1, c03
 | |
| 	FMUL	a3, c02, t1
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[AO + 10 * SIZE], a1
 | |
| 	LDF	[AO + 11 * SIZE], a2
 | |
| 
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a2, c03, t1
 | |
| 
 | |
| 	FSUB	c04, t1, c04
 | |
| 
 | |
| 	LDF	[AO + 15 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| /* RN/RT: all four results are multiplied by the single coefficient at [BO + 0] */
 | |
| #ifdef RN
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	LDF	[BO +  0 * SIZE], a1
 | |
| 
 | |
| 	FMUL	a1, c01, c01
 | |
| 	FMUL	a1, c02, c02
 | |
| 	FMUL	a1, c03, c03
 | |
| 	FMUL	a1, c04, c04
 | |
| #endif
 | |
| /* LN steps C1 back by 4 * SIZE before storing; the other variants step forward after storing */
 | |
| #ifdef LN
 | |
| 	add	C1, -4 * SIZE, C1
 | |
| #endif
 | |
| /* Store the results back through BO (LN/LT) or AO (RN/RT), then to C1 */
 | |
| #if defined(LN) || defined(LT)
 | |
| 	STF	c01, [BO +  0 * SIZE]
 | |
| 	STF	c02, [BO +  1 * SIZE]
 | |
| 	STF	c03, [BO +  2 * SIZE]
 | |
| 	STF	c04, [BO +  3 * SIZE]
 | |
| #else
 | |
| 	STF	c01, [AO +  0 * SIZE]
 | |
| 	STF	c02, [AO +  1 * SIZE]
 | |
| 	STF	c03, [AO +  2 * SIZE]
 | |
| 	STF	c04, [AO +  3 * SIZE]
 | |
| #endif
 | |
| 
 | |
| 	STF	c01, [C1 + 0 * SIZE]
 | |
| 	STF	c02, [C1 + 1 * SIZE]
 | |
| 	STF	c03, [C1 + 2 * SIZE]
 | |
| 	STF	c04, [C1 + 3 * SIZE]
 | |
| /* Clear t1-t4 so the next tile's first FADDs add zero */
 | |
| 	FMOV	FZERO, t1
 | |
| 	FMOV	FZERO, t2
 | |
| 	FMOV	FZERO, t3
 | |
| 	FMOV	FZERO, t4
 | |
| 
 | |
| #ifndef LN
 | |
| 	add	C1, 4 * SIZE, C1
 | |
| #endif
 | |
| /* RT: AORIG advances by K << (2 + BASE_SHIFT) */
 | |
| #ifdef RT
 | |
| 	sll	K, 2 + BASE_SHIFT, TEMP1
 | |
| 	add	AORIG, TEMP1, AORIG
 | |
| #endif
 | |
| /* LT/RN: AO += (K - KK) << (2 + BASE_SHIFT), BO += (K - KK) << BASE_SHIFT */
 | |
| #if defined(LT) || defined(RN)
 | |
| 	sub	K, KK, TEMP1
 | |
| 	sll	TEMP1, 2 + BASE_SHIFT, TEMP2
 | |
| 	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	AO, TEMP2, AO
 | |
| 	add	BO, TEMP1, BO
 | |
| #endif
 | |
| /* KK moves by the tile height: +4 for LT, -4 for LN */
 | |
| #ifdef LT
 | |
| 	add	KK, 4, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef LN
 | |
| 	sub	KK, 4, KK
 | |
| #endif
 | |
| /* Next tile: I -= 1 and loop back to .LL221 while I > 0 */
 | |
| 	add	I, -1, I
 | |
| 	cmp	I, 0
 | |
| 
 | |
| 	bg,pt	%icc, .LL221
 | |
| 	nop
 | |
| 
 | |
| 
 | |
| 
 | |
| .LL299:	/* End of the section that stores only to C1: final B and KK bookkeeping before the exit */
 | |
| #ifdef LN
 | |
| 	sll	K, 0 + BASE_SHIFT, TEMP1
 | |
| 	add	B, TEMP1, B	/* LN: B += K << BASE_SHIFT */
 | |
| #endif
 | |
| 
 | |
| #if defined(LT) || defined(RN)
 | |
| 	mov	BO, B	/* LT/RN: B continues from where BO ended */
 | |
| #endif
 | |
| /* RN/RT step KK by 1 in opposite directions */
 | |
| #ifdef RN
 | |
| 	add	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| #ifdef RT
 | |
| 	sub	KK, 1, KK
 | |
| #endif
 | |
| 
 | |
| 
 | |
| .LL999:	/* Common exit, also reached directly from .LL200 when N & 1 is 0 */
 | |
| 	return	%i7 + 8
 | |
| 	clr	%o0	/* delay slot of return: runs after the register window is restored, so this clears the caller-visible %o0 (result 0) */
 | |
| /* EPILOGUE is a macro defined outside this view */
 | |
| 	EPILOGUE
 |