429 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			429 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* How far ahead of the current position, in elements, the unit-stride
   loop touches each vector. */
#define PREFETCHSIZE 40

/*
 * Scaled vector accumulate. For every element the code computes
 *
 *         dst[i] = dst[i] + da * src[i]
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LD, ST, MUL, ADD, SIZE and SXADDQ are macros
 * supplied by common.h, which is not visible here; they are presumed to be
 * the function framing and the precision-specific load / store / multiply /
 * add / element-size / scaled-add primitives.
 *
 * Register roles, as established by the instructions below:
 *   $16        element count
 *   $f19       scalar da on entry; copied to $f30 and kept there
 *   $20, $21   source pointer and source stride (compared against 1)
 *   $24, $23   destination pointer and destination stride, both fetched
 *              from the caller's stack at 0($sp) and 8($sp)
 *   $1         number of whole 8-element blocks   ($16 >> 3)
 *   $2         number of leftover elements        ($16 & 7)
 *   $f2, $f3   saved in the 16-byte frame here, reloaded at $End / $SubEnd
 */
	PROLOGUE
	PROFCODE
	.frame	$sp, 16, $26, 0

	/* The stack arguments are read before $sp is lowered, so the offsets
	   are relative to the incoming $sp.
	   NOTE(review): the stride in $23 is read with a 32-bit ldl while the
	   pointer in $24 uses a 64-bit ldq - confirm the stride always fits
	   in 32 bits. */
	ldq	$24, 0($sp)
	fmov	$f19, $f30
	ldl	$23, 8($sp)
	lda	$sp, -16($sp)
#ifndef PROFILE
	.prologue 0
#else
	.prologue 1
#endif

	/* Split the count into blocks of 8 plus a remainder, spill $f2/$f3,
	   and test both strides against 1. The spills complete before the
	   first branch to $End, so every exit can reload them. */
	nop
	sra	$16, 3, $1
	stt	$f2, 0($sp)
	cmpeq	$21, 1, $3

	stt	$f3, 8($sp)
	cmpeq	$23, 1, $4
	and	$16, 7, $2
	ble	$16, $End		/* count <= 0: nothing to do */

	and	$3, $4, $3		/* $3 != 0 only when both strides are 1 */
	fbeq	$f30, $End		/* da == 0: destination is left untouched */

	beq	$3, $Sub		/* some stride is not 1: strided path */
	ble	$1, $Remain		/* fewer than 8 elements: scalar loop only */
	.align 4

	/* Unit-stride path with at least one full block: preload the first
	   eight source elements into $f10-$f17 and the first eight
	   destination elements into $f18-$f25. */
	LD	$f10, 0*SIZE($20)
	LD	$f11, 1*SIZE($20)
	LD	$f12, 2*SIZE($20)
	LD	$f13, 3*SIZE($20)

	LD	$f18, 0*SIZE($24)
	LD	$f19, 1*SIZE($24)
	LD	$f20, 2*SIZE($24)
	LD	$f21, 3*SIZE($24)

	LD	$f14, 4*SIZE($20)
	LD	$f15, 5*SIZE($20)
	LD	$f16, 6*SIZE($20)
	LD	$f17, 7*SIZE($20)

	LD	$f22, 4*SIZE($24)
	LD	$f23, 5*SIZE($24)
	LD	$f24, 6*SIZE($24)
	LD	$f25, 7*SIZE($24)

	/* One block is now in flight. Only the source pointer advances here;
	   $24 still addresses the block just loaded because the results are
	   stored back through it. If that was the only block, go straight to
	   the drain code at $LoopEnd. */
	subq	$1, 1, $1
	addq	$20, 8*SIZE, $20
	unop
	ble	$1, $LoopEnd
	.align 4

/*
 * Unit-stride steady state. On entry $f10-$f17 hold the current source
 * block and $f18-$f25 the current destination block. Each pass computes and
 * stores the current block at 0..7*SIZE($24) while loading the next source
 * block from 0..7*SIZE($20) and the next destination block from
 * 8..15*SIZE($24). $1 counts the blocks still to be loaded.
 */
$Loop:
	/* Loads whose targets are $f31 / $31: the values are discarded, so
	   these only touch memory PREFETCHSIZE elements ahead. */
	ldt	$f31, PREFETCHSIZE * SIZE($24)
	ldl	$31, PREFETCHSIZE * SIZE($20)

	/* Products for elements 0-3; each source register is refilled right
	   after it has been consumed. */
	MUL	$f30, $f10, $f26
	LD	$f10, 0*SIZE($20)
	MUL	$f30, $f11, $f27
	LD	$f11, 1*SIZE($20)

	MUL	$f30, $f12, $f28
	LD	$f12, 2*SIZE($20)
	MUL	$f30, $f13, $f29
	LD	$f13, 3*SIZE($20)

	/* Sums for elements 0-3 interleaved with products for elements 4-7.
	   $f26-$f29 are reused once the matching ADD has read them. */
	ADD	$f18, $f26, $f0
	LD	$f18, 8*SIZE($24)
	MUL	$f30, $f14, $f26
	LD	$f14, 4*SIZE($20)

	ADD	$f19, $f27, $f1
	LD	$f19, 9*SIZE($24)
	MUL	$f30, $f15, $f27
	LD	$f15, 5*SIZE($20)

	ADD	$f20, $f28, $f2
	LD	$f20, 10*SIZE($24)
	MUL	$f30, $f16, $f28
	LD	$f16, 6*SIZE($20)

	ADD	$f21, $f29, $f3
	LD	$f21, 11*SIZE($24)
	MUL	$f30, $f17, $f29
	LD	$f17, 7*SIZE($20)

	/* Store elements 0-3, then reuse $f0-$f3 for the sums of 4-7. */
	ST	$f0, 0*SIZE($24)
	ADD	$f22, $f26, $f0
	ST	$f1, 1*SIZE($24)
	ADD	$f23, $f27, $f1

	ST	$f2, 2*SIZE($24)
	ADD	$f24, $f28, $f2
	ST	$f3, 3*SIZE($24)
	ADD	$f25, $f29, $f3

	/* Second half of the next destination block. */
	LD	$f22, 12*SIZE($24)
	LD	$f23, 13*SIZE($24)
	LD	$f24, 14*SIZE($24)
	LD	$f25, 15*SIZE($24)

	ST	$f0, 4*SIZE($24)
	ST	$f1, 5*SIZE($24)
	ST	$f2, 6*SIZE($24)
	ST	$f3, 7*SIZE($24)

	subq	$1, 1, $1
	addq	$24, 8*SIZE, $24
	addq	$20, 8*SIZE, $20
	bgt	$1, $Loop
	.align 4

/*
 * Unit-stride drain: finish the block already held in $f10-$f17 (source)
 * and $f18-$f25 (destination) without loading anything further, store it at
 * 0..7*SIZE($24), then step $24 past it. Control falls through to $Remain.
 */
$LoopEnd:
	MUL	$f30, $f10, $f26
	MUL	$f30, $f11, $f27
	MUL	$f30, $f12, $f28
	MUL	$f30, $f13, $f29

	/* Sums for elements 0-3 interleaved with products for elements 4-7. */
	ADD	$f18, $f26, $f0
	MUL	$f30, $f14, $f26
	ADD	$f19, $f27, $f1
	MUL	$f30, $f15, $f27

	ADD	$f20, $f28, $f2
	MUL	$f30, $f16, $f28
	ADD	$f21, $f29, $f3
	MUL	$f30, $f17, $f29

	/* Store elements 0-3 while forming the sums for elements 4-7. */
	ST	$f0, 0*SIZE($24)
	ADD	$f22, $f26, $f0
	ST	$f1, 1*SIZE($24)
	ADD	$f23, $f27, $f1

	ST	$f2, 2*SIZE($24)
	ADD	$f24, $f28, $f2
	ST	$f3, 3*SIZE($24)
	ADD	$f25, $f29, $f3

	ST	$f0, 4*SIZE($24)
	ST	$f1, 5*SIZE($24)
	ST	$f2, 6*SIZE($24)
	ST	$f3, 7*SIZE($24)
	addq	$24, 8*SIZE, $24
	.align 4

/*
 * Unit-stride tail: handle the $2 leftover elements one at a time.
 * Skipped entirely when $2 <= 0.
 */
$Remain:
	ble	$2, $End
	.align 4

$RemainLoop:
	LD	$f10, 0*SIZE($20)
	LD	$f11, 0*SIZE($24)
	addq	$20, SIZE, $20
	addq	$24, SIZE, $24

	MUL	$f30, $f10, $f12
	subq	$2, 1, $2
	ADD	$f11, $f12, $f13
	/* $24 has already moved on by one element, hence the -1 offset. */
	ST	$f13, -1*SIZE($24)
	bgt	$2, $RemainLoop
	.align 4

/*
 * Exit used by the early-out checks and by the unit-stride path: reload
 * $f2/$f3 from the frame, release the 16-byte frame and return.
 */
$End:
	ldt	$f2, 0($sp)
	ldt	$f3, 8($sp)
	lda	$sp, 16($sp)
	ret
	.align 4

/*
 * Strided path, entered when at least one stride differs from 1.
 *
 * SXADDQ comes from common.h (not shown here); it is presumed to add the
 * first operand, scaled by the element size, to the second operand.
 *
 *   $4    block counter for $SubLoop ($1 - 1)
 *   $20   source cursor, stepped by the stride in $21 after every load
 *   $22   destination read cursor, stepped by the stride in $23
 *   $24   destination write cursor; not moved here so that the stores
 *         later land on the elements loaded through $22
 *
 * With fewer than 8 elements ($1 <= 0) only the scalar loop at $SubRemain
 * runs. Otherwise the first 8 source elements are preloaded into $f10-$f17
 * and the first 8 destination elements into $f18-$f25.
 */
$Sub:
	subq	$1, 1, $4
	ble	$1, $SubRemain
	.align 4

	/* Source elements 0-3. */
	LD	$f10, 0($20)
	SXADDQ	$21, $20, $20

	LD	$f11, 0($20)
	SXADDQ	$21, $20, $20
	LD	$f12, 0($20)
	SXADDQ	$21, $20, $20

	LD	$f13, 0($20)
	SXADDQ	$21, $20, $20

	/* Destination elements 0-3. The first one is read through $24, and
	   $22 is initialised here to point at the element after it. */
	LD	$f18, 0($24)
	SXADDQ	$23, $24, $22

	LD	$f19, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f20, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f21, 0($22)
	SXADDQ	$23, $22, $22

	/* Source elements 4-7. */
	LD	$f14, 0($20)
	SXADDQ	$21, $20, $20

	LD	$f15, 0($20)
	SXADDQ	$21, $20, $20
	LD	$f16, 0($20)
	SXADDQ	$21, $20, $20

	LD	$f17, 0($20)
	SXADDQ	$21, $20, $20

	/* Destination elements 4-7. */
	LD	$f22, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f23, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f24, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f25, 0($22)
	SXADDQ	$23, $22, $22

	/* If that was the only block, drain it at $SubLoopEnd. */
	unop
	ble	$4, $SubLoopEnd
	.align 4

/*
 * Strided steady state. On entry $f10-$f17 hold the current source block
 * and $f18-$f25 the current destination block. Each pass computes and
 * stores the current block through $24 while loading the next source block
 * through $20 and the next destination block through $22. The cursors step
 * by the strides in $21 (source) and $23 (destination) via SXADDQ, a
 * common.h macro presumed to be an element-size-scaled add. $4 counts the
 * blocks still to be loaded.
 */
$SubLoop:
	/* Products for elements 0-3; each source register is refilled as
	   soon as it has been consumed. */
	MUL	$f30, $f10, $f26
	LD	$f10, 0($20)
	unop
	SXADDQ	$21, $20, $20

	MUL	$f30, $f11, $f27
	LD	$f11, 0($20)
	unop
	SXADDQ	$21, $20, $20

	MUL	$f30, $f12, $f28
	LD	$f12, 0($20)
	unop
	SXADDQ	$21, $20, $20

	MUL	$f30, $f13, $f29
	LD	$f13, 0($20)
	unop
	SXADDQ	$21, $20, $20

	/* Sums for elements 0-3 interleaved with products for elements 4-7. */
	ADD	$f18, $f26, $f0
	MUL	$f30, $f14, $f26
	LD	$f14, 0($20)
	SXADDQ	$21, $20, $20

	ADD	$f19, $f27, $f1
	MUL	$f30, $f15, $f27
	LD	$f15, 0($20)
	SXADDQ	$21, $20, $20

	ADD	$f20, $f28, $f2
	MUL	$f30, $f16, $f28
	LD	$f16, 0($20)
	SXADDQ	$21, $20, $20

	ADD	$f21, $f29, $f3
	MUL	$f30, $f17, $f29
	LD	$f17, 0($20)
	SXADDQ	$21, $20, $20

	/* Store elements 0-3, then reuse $f0-$f3 for the sums of 4-7. */
	ST	$f0, 0($24)
	SXADDQ	$23, $24, $24
	ADD	$f22, $f26, $f0
	unop

	ST	$f1, 0($24)
	SXADDQ	$23, $24, $24
	ADD	$f23, $f27, $f1
	unop

	ST	$f2, 0($24)
	SXADDQ	$23, $24, $24
	ADD	$f24, $f28, $f2
	unop

	ST	$f3, 0($24)
	SXADDQ	$23, $24, $24
	ADD	$f25, $f29, $f3
	unop

	/* Next destination block, read through $22. */
	LD	$f18, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f19, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f20, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f21, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f22, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f23, 0($22)
	SXADDQ	$23, $22, $22

	LD	$f24, 0($22)
	SXADDQ	$23, $22, $22
	LD	$f25, 0($22)
	SXADDQ	$23, $22, $22

	/* Store elements 4-7 of the current block. */
	ST	$f0, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f1, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f2, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f3, 0($24)
	SXADDQ	$23, $24, $24

	subq	$4, 1, $4
	bgt	$4, $SubLoop
	.align 4

/*
 * Strided drain: finish the block already held in $f10-$f17 (source) and
 * $f18-$f25 (destination) without loading anything further. Results are
 * written through $24, which steps by the stride in $23 after every store
 * (SXADDQ is a common.h macro, presumed to be an element-size-scaled add).
 * Control falls through to $SubRemain.
 */
$SubLoopEnd:
	MUL	$f30, $f10, $f26
	MUL	$f30, $f11, $f27
	MUL	$f30, $f12, $f28
	MUL	$f30, $f13, $f29

	/* Sums for elements 0-3 interleaved with products for elements 4-7. */
	ADD	$f18, $f26, $f0
	MUL	$f30, $f14, $f26
	ADD	$f19, $f27, $f1
	MUL	$f30, $f15, $f27

	ADD	$f20, $f28, $f2
	MUL	$f30, $f16, $f28
	ADD	$f21, $f29, $f3
	MUL	$f30, $f17, $f29

	/* Store elements 0-3. */
	ST	$f0, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f1, 0($24)
	SXADDQ	$23, $24, $24

	ST	$f2, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f3, 0($24)
	SXADDQ	$23, $24, $24

	/* Sums for elements 4-7, reusing $f0-$f3 now that they are stored. */
	ADD	$f22, $f26, $f0
	ADD	$f23, $f27, $f1
	ADD	$f24, $f28, $f2
	ADD	$f25, $f29, $f3

	/* Store elements 4-7. */
	ST	$f0, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f1, 0($24)
	SXADDQ	$23, $24, $24

	ST	$f2, 0($24)
	SXADDQ	$23, $24, $24
	ST	$f3, 0($24)
	SXADDQ	$23, $24, $24
	.align 4

/*
 * Strided tail: handle the $2 leftover elements one at a time, stepping the
 * source cursor $20 by the stride in $21 and the destination cursor $24 by
 * the stride in $23 (SXADDQ is a common.h macro, presumed to be an
 * element-size-scaled add). Skipped entirely when $2 <= 0.
 */
$SubRemain:
	ble	$2, $SubEnd
	.align 4

$SubRemainLoop:
	LD	$f10, 0($20)
	LD	$f11, 0($24)
	SXADDQ	$21, $20, $20

	MUL	$f30, $f10, $f12
	subq	$2, 1, $2
	ADD	$f11, $f12, $f13
	/* The store happens before $24 moves, so the offset is 0 here. */
	ST	$f13, 0($24)
	SXADDQ	$23, $24, $24

	bgt	$2, $SubRemainLoop
	.align 4

/*
 * Exit for the strided path: reload $f2/$f3 from the frame, release the
 * 16-byte frame and return. EPILOGUE is a common.h macro (not shown here),
 * presumed to close the function opened by PROLOGUE.
 */
$SubEnd:
	ldt	$f2, 0($sp)
	ldt	$f3, 8($sp)
	lda	$sp, 16($sp)
	ret
	EPILOGUE