Import GotoBLAS2 1.13 BSD version codes.
This commit is contained in:
124
kernel/alpha/KERNEL
Normal file
124
kernel/alpha/KERNEL
Normal file
@@ -0,0 +1,124 @@
|
||||
# Default source files for the level-1 kernels in this directory.
# Each ifndef guard assigns its default only when the variable has no value yet.
# The S/D AMIN entries reuse amax.S and the C/Z AMIN entries reuse zamax.S.
# NOTE(review): amax.S swaps its compare operands when USE_MIN is defined;
# presumably the build defines USE_MIN for the *AMIN objects - confirm.
ifndef SAMINKERNEL
|
||||
SAMINKERNEL = amax.S
|
||||
endif
|
||||
|
||||
ifndef DAMINKERNEL
|
||||
DAMINKERNEL = amax.S
|
||||
endif
|
||||
|
||||
ifndef CAMINKERNEL
|
||||
CAMINKERNEL = zamax.S
|
||||
endif
|
||||
|
||||
ifndef ZAMINKERNEL
|
||||
ZAMINKERNEL = zamax.S
|
||||
endif
|
||||
|
||||
# S/D MIN share the max.S source.
ifndef SMINKERNEL
|
||||
SMINKERNEL = max.S
|
||||
endif
|
||||
|
||||
ifndef DMINKERNEL
|
||||
DMINKERNEL = max.S
|
||||
endif
|
||||
|
||||
# Index-returning variants: IS/ID AMIN use iamax.S, IC/IZ AMIN use izamax.S.
ifndef ISAMINKERNEL
|
||||
ISAMINKERNEL = iamax.S
|
||||
endif
|
||||
|
||||
ifndef IDAMINKERNEL
|
||||
IDAMINKERNEL = iamax.S
|
||||
endif
|
||||
|
||||
ifndef ICAMINKERNEL
|
||||
ICAMINKERNEL = izamax.S
|
||||
endif
|
||||
|
||||
ifndef IZAMINKERNEL
|
||||
IZAMINKERNEL = izamax.S
|
||||
endif
|
||||
|
||||
# NOTE(review): ISMIN/IDMIN point at iamax.S, the same source as the I?AMIN
# entries above, whereas S/D MIN use max.S; confirm this mapping is intended.
ifndef ISMINKERNEL
|
||||
ISMINKERNEL = iamax.S
|
||||
endif
|
||||
|
||||
ifndef IDMINKERNEL
|
||||
IDMINKERNEL = iamax.S
|
||||
endif
|
||||
|
||||
# Complex copy kernels use the copy.S source.
ifndef CCOPYKERNEL
|
||||
CCOPYKERNEL = copy.S
|
||||
endif
|
||||
|
||||
ifndef ZCOPYKERNEL
|
||||
ZCOPYKERNEL = copy.S
|
||||
endif
|
||||
|
||||
# NRM2 has one dedicated source file per precision/type.
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = snrm2.S
|
||||
endif
|
||||
|
||||
ifndef DNRM2KERNEL
|
||||
DNRM2KERNEL = dnrm2.S
|
||||
endif
|
||||
|
||||
ifndef CNRM2KERNEL
|
||||
CNRM2KERNEL = cnrm2.S
|
||||
endif
|
||||
|
||||
ifndef ZNRM2KERNEL
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
# GEMM settings (assigned unconditionally, no ifndef guard).
# S and D share gemm_kernel_4x4.S with the generic *_4.c copy routines;
# C and Z share zgemm_kernel_2x2.S with the generic zgemm *_2.c copy routines.
# The *COPYOBJ names carry a per-type prefix and the $(SUFFIX) extension.
SGEMMKERNEL = gemm_kernel_4x4.S
|
||||
SGEMM_BETA = gemm_beta.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = gemm_kernel_4x4.S
|
||||
DGEMM_BETA = gemm_beta.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = zgemm_kernel_2x2.S
|
||||
CGEMM_BETA = zgemm_beta.S
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_2x2.S
|
||||
ZGEMM_BETA = zgemm_beta.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX)
|
||||
|
||||
# NOTE(review): the four *_BETA assignments below repeat the ones above with
# identical values; they are redundant but harmless.
SGEMM_BETA = gemm_beta.S
|
||||
DGEMM_BETA = gemm_beta.S
|
||||
CGEMM_BETA = zgemm_beta.S
|
||||
ZGEMM_BETA = zgemm_beta.S
|
||||
|
||||
# TRSM kernel sources for the four variants (LN, LT, RN, RT) of each type.
# S and D use the trsm_kernel_4x4_* files; C and Z use ztrsm_kernel_2x2_*.
# In every group the RN variant names the same file as LT.
# NOTE(review): presumably RN and LT are told apart by build-time defines - confirm.
STRSMKERNEL_LN = trsm_kernel_4x4_LN.S
|
||||
STRSMKERNEL_LT = trsm_kernel_4x4_LT.S
|
||||
STRSMKERNEL_RN = trsm_kernel_4x4_LT.S
|
||||
STRSMKERNEL_RT = trsm_kernel_4x4_RT.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
|
||||
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S
|
||||
2
kernel/alpha/Makefile
Normal file
2
kernel/alpha/Makefile
Normal file
@@ -0,0 +1,2 @@
|
||||
# Double-colon clean rule with no prerequisites and no recipe in this file.
clean ::
|
||||
|
||||
283
kernel/alpha/amax.S
Normal file
283
kernel/alpha/amax.S
Normal file
@@ -0,0 +1,283 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Register aliases for the three integer inputs read by this routine. */
/* NOTE(review): presumably n, x and incx arrive in $16-$18 - confirm against the caller. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
|
||||
/* CMPLT(a, b) emits cmptlt with the operands as given, or swapped when */
/* USE_MIN is defined, so one code body can track either extreme. */
#ifndef USE_MIN
|
||||
#define CMPLT(a, b) cmptlt a, b
|
||||
#else
|
||||
#define CMPLT(a, b) cmptlt b, a
|
||||
#endif
|
||||
|
||||
/* Stack frame size in bytes; the routine stores five registers at offsets 0..32. */
#define STACKSIZE 6 * 8
|
||||
|
||||
/*
 * Entry and preload.
 * Allocates the frame, saves $f2-$f6, clears the compare-result registers
 * $f16-$f19 and the result $f0, and branches to $End unless both N > 0 and
 * INCX > 0. $1 = N >> 3 is the number of full groups of eight elements.
 * PROLOGUE, PROFCODE, SXADDQ, LD and SIZE are macros defined outside this file.
 * NOTE(review): SXADDQ INCX, $31, INCX presumably scales INCX to a byte
 * stride - confirm in common.h.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, STACKSIZE, $26, 0
|
||||
|
||||
lda $sp, -STACKSIZE($sp)
|
||||
nop
|
||||
.align 4
|
||||
|
||||
stt $f2, 0($sp)
|
||||
fclr $f16
|
||||
/* $2 = (0 < N) */
cmplt $31, N, $2
|
||||
unop
|
||||
|
||||
stt $f3, 8($sp)
|
||||
fclr $f17
|
||||
/* $3 = (0 < INCX) */
cmplt $31, INCX, $3
|
||||
unop
|
||||
|
||||
stt $f4, 16($sp)
|
||||
fclr $f18
|
||||
SXADDQ INCX, $31, INCX
|
||||
unop
|
||||
|
||||
stt $f5, 24($sp)
|
||||
fclr $f19
|
||||
/* $0 is non-zero only when both tests above passed */
and $2, $3, $0
|
||||
unop
|
||||
|
||||
stt $f6, 32($sp)
|
||||
fclr $f0
|
||||
sra N, 3, $1
|
||||
beq $0, $End # if (n <= 0) or (incx <= 0) return
|
||||
.align 4
|
||||
|
||||
/* Load the first element and set $f0 = |x[0]|. With no full group of */
/* eight ($1 <= 0) go straight to the tail code at $L15. */
LD $f20, 0 * SIZE(X)
|
||||
unop
|
||||
fabs $f20, $f0
|
||||
ble $1, $L15
|
||||
.align 4
|
||||
|
||||
/* Seed the other seven running values ($f1-$f6, $f28) with |x[0]| while */
/* loading the next seven elements into $f21-$f27; X advances by INCX per load. */
fabs $f20, $f1
|
||||
unop
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f21, 0 * SIZE(X)
|
||||
fabs $f20, $f2
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fabs $f20, $f3
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f23, 0 * SIZE(X)
|
||||
fabs $f20, $f4
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f24, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
fabs $f20, $f5
|
||||
unop
|
||||
|
||||
LD $f25, 0 * SIZE(X)
|
||||
fabs $f20, $f6
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f26, 0 * SIZE(X)
|
||||
fabs $f20, $f28
|
||||
addq X, INCX, X
|
||||
/* One group is now loaded; if it was the only one, drain it at $L13. */
lda $1, -1($1)
|
||||
|
||||
LD $f27, 0 * SIZE(X)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
ble $1, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Main loop: eight elements per pass.
 * Takes the absolute value of the eight elements loaded earlier ($f20-$f27)
 * into $f29, $f30, $f10-$f15, compares each with its running value
 * ($f0-$f6, $f28) using CMPLT, and uses fcmovne to replace the running value
 * when the compare result is non-zero. Loads for the next pass are
 * interleaved with the arithmetic.
 * The first four fcmovne complete compares issued at the end of the previous
 * pass; on the first pass they do nothing because $f16-$f19 were cleared at entry.
 */
$L12:
|
||||
fcmovne $f16, $f12, $f4
|
||||
unop
|
||||
fabs $f20, $f29
|
||||
/* NOTE(review): a load into $31 discards the data; presumably a prefetch hint - confirm. */
ldl $31, 56 * SIZE(X)
|
||||
|
||||
fcmovne $f17, $f13, $f5
|
||||
LD $f20, 0 * SIZE(X)
|
||||
fabs $f21, $f30
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
LD $f21, 0 * SIZE(X)
|
||||
fabs $f22, $f10
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f19, $f15, $f28
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fabs $f23, $f11
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f24, $f12
|
||||
LD $f23, 0 * SIZE(X)
|
||||
CMPLT($f0, $f29), $f16
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f25, $f13
|
||||
LD $f24, 0 * SIZE(X)
|
||||
CMPLT($f1, $f30), $f17
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f26, $f14
|
||||
LD $f25, 0 * SIZE(X)
|
||||
CMPLT($f2, $f10), $f18
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f27, $f15
|
||||
LD $f26, 0 * SIZE(X)
|
||||
CMPLT($f3, $f11), $f19
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f16, $f29, $f0
|
||||
LD $f27, 0 * SIZE(X)
|
||||
CMPLT($f4, $f12), $f16
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f17, $f30, $f1
|
||||
unop
|
||||
CMPLT($f5, $f13), $f17
|
||||
lda $1, -1($1) # i --
|
||||
|
||||
fcmovne $f18, $f10, $f2
|
||||
unop
|
||||
CMPLT($f6, $f14), $f18
|
||||
unop
|
||||
|
||||
fcmovne $f19, $f11, $f3
|
||||
unop
|
||||
CMPLT($f28, $f15), $f19
|
||||
bgt $1,$L12
|
||||
.align 4
|
||||
|
||||
/*
 * Drain and reduce.
 * Runs the same compare/select sequence as the main loop for the last eight
 * loaded elements ($f20-$f27) without issuing further loads, then folds the
 * eight running values ($f0-$f6, $f28) pairwise: 8 -> 4 -> 2 -> 1, leaving
 * the result in $f0.
 */
$L13:
|
||||
fcmovne $f16, $f12, $f4
|
||||
fabs $f20, $f29
|
||||
fcmovne $f17, $f13, $f5
|
||||
fabs $f21, $f30
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
fabs $f22, $f10
|
||||
fcmovne $f19, $f15, $f28
|
||||
fabs $f23, $f11
|
||||
|
||||
fabs $f24, $f12
|
||||
CMPLT($f0, $f29), $f16
|
||||
fabs $f25, $f13
|
||||
CMPLT($f1, $f30), $f17
|
||||
|
||||
fabs $f26, $f14
|
||||
CMPLT($f2, $f10), $f18
|
||||
fabs $f27, $f15
|
||||
CMPLT($f3, $f11), $f19
|
||||
|
||||
fcmovne $f16, $f29, $f0
|
||||
CMPLT($f4, $f12), $f16
|
||||
fcmovne $f17, $f30, $f1
|
||||
CMPLT($f5, $f13), $f17
|
||||
|
||||
fcmovne $f18, $f10, $f2
|
||||
CMPLT($f6, $f14), $f18
|
||||
fcmovne $f19, $f11, $f3
|
||||
CMPLT($f28, $f15), $f19
|
||||
|
||||
/* Finish the last four selects, then start the 8 -> 4 pairwise compares. */
fcmovne $f16, $f12, $f4
|
||||
CMPLT($f0, $f1), $f16
|
||||
fcmovne $f17, $f13, $f5
|
||||
CMPLT($f2, $f3), $f17
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
CMPLT($f4, $f5), $f18
|
||||
fcmovne $f19, $f15, $f28
|
||||
CMPLT($f6, $f28), $f19
|
||||
|
||||
fcmovne $f16, $f1, $f0
|
||||
fcmovne $f17, $f3, $f2
|
||||
fcmovne $f18, $f5, $f4
|
||||
fcmovne $f19, $f28, $f6
|
||||
|
||||
/* 4 -> 2 */
CMPLT($f0, $f2), $f16
|
||||
CMPLT($f4, $f6), $f17
|
||||
|
||||
fcmovne $f16, $f2, $f0
|
||||
fcmovne $f17, $f6, $f4
|
||||
|
||||
/* 2 -> 1, result in $f0 */
CMPLT($f0, $f4), $f16
|
||||
fcmovne $f16, $f4, $f0
|
||||
.align 4
|
||||
|
||||
/* Tail: $1 = N & 7 leftover elements; go to $End when there are none. */
$L15:
|
||||
and N, 7, $1
|
||||
unop
|
||||
unop
|
||||
ble $1, $End
|
||||
.align 4
|
||||
|
||||
/* One element per pass: load, advance X by INCX, take the absolute value, */
/* and replace $f0 when CMPLT reports a non-zero result. */
$L16:
|
||||
LD $f20, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f20, $f29
|
||||
CMPLT($f0, $f29), $f16
|
||||
fcmovne $f16, $f29, $f0
|
||||
|
||||
lda $1, -1($1) # i --
|
||||
bgt $1, $L16
|
||||
.align 4
|
||||
|
||||
/* Common exit: reload $f2-$f6 from the frame, release the frame and return. */
/* NOTE(review): the result is left in $f0, presumably the floating-point return register - confirm. */
$End:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
|
||||
ldt $f6, 32($sp)
|
||||
lda $sp, STACKSIZE($sp)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
206
kernel/alpha/asum.S
Normal file
206
kernel/alpha/asum.S
Normal file
@@ -0,0 +1,206 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Offset (in elements, doubled at the use site) for the ldl $31 load in the main loop. */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Integer registers: three inputs plus the loop counter I. */
/* NOTE(review): presumably n, x and incx arrive in $16-$18 - confirm against the caller. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
/* s0-s3: four partial sums; s0 holds the final result. */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0-a7: elements loaded from X. */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
/* t0-t3: absolute values waiting to be added to s0-s3. */
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
/*
 * Entry and preload.
 * Clears s0 and t0 and goes to $L999 when N <= 0. I = N >> 3 counts full
 * groups of eight; with none, the tail code at $L15 handles everything.
 * Otherwise the remaining sums and pending values are cleared and the first
 * six elements are loaded into a0-a5.
 * NOTE(review): SXADDQ INCX, X, X presumably advances X by INCX elements
 * (macro defined outside this file) - confirm in common.h.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
ble N, $L999
|
||||
|
||||
sra N, 3, I
|
||||
fclr s1
|
||||
fclr s2
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
fclr t1
|
||||
SXADDQ INCX, X, X
|
||||
fclr t2
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
fclr t3
|
||||
SXADDQ INCX, X, X
|
||||
fclr s3
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a5, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* With only one full group, skip the steady-state loop and drain at $L13. */
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Main loop: eight elements per pass.
 * Each step adds a pending absolute value (t0-t3) into its partial sum
 * (s0-s3) and then overwrites that t register with the absolute value of the
 * next loaded element. a6 and a7 of the current group and a0-a5 of the next
 * group are loaded along the way. The loop ends when I reaches zero.
 */
$L12:
|
||||
ADD s0, t0, s0
|
||||
/* NOTE(review): a load into $31 discards the data; presumably a prefetch hint - confirm. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
|
||||
fabs a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fabs a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 0 * SIZE(X)
|
||||
fabs a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fabs a3, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 0 * SIZE(X)
|
||||
fabs a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fabs a5, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 0 * SIZE(X)
|
||||
fabs a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fabs a7, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
/*
 * Drain: finish the last full group.
 * Loads its final two elements (a6, a7), runs the same add/fabs sequence
 * with no further loads, flushes the pending t1-t3 into s1-s3, and combines
 * s1 into s0 and s3 into s2. The pending t0 is left for the code that follows.
 */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fabs a0, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 0 * SIZE(X)
|
||||
fabs a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fabs a2, t2
|
||||
ADD s3, t3, s3
|
||||
fabs a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fabs a4, t0
|
||||
ADD s1, t1, s1
|
||||
fabs a5, t1
|
||||
ADD s2, t2, s2
|
||||
fabs a6, t2
|
||||
ADD s3, t3, s3
|
||||
fabs a7, t3
|
||||
|
||||
ADD s1, t1, s1
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
ADD s0, s1, s0
|
||||
ADD s2, s3, s2
|
||||
.align 4
|
||||
|
||||
/* Tail setup: I = N & 7 leftover elements; s2 is folded into s0. */
/* With no leftovers, go to $L999. */
$L15:
|
||||
and N, 7, I
|
||||
ADD s0, s2, s0
|
||||
unop
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
/* One element per pass: add the pending t0 into s0, then load the next */
/* element, advance X and put its absolute value in t0. */
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
fabs a0, t0
|
||||
|
||||
lda I, -1(I)
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
/* Exit: add the last pending value t0 into s0 ($f0) and return. */
/* On the N <= 0 path both were cleared at entry, so s0 stays zero. */
$L999:
|
||||
ADD s0, t0, s0
|
||||
ret
|
||||
EPILOGUE
|
||||
428
kernel/alpha/axpy.S
Normal file
428
kernel/alpha/axpy.S
Normal file
@@ -0,0 +1,428 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Element offset used by the two prefetch-style loads at the top of $Loop. */
#define PREFETCHSIZE 40
|
||||
|
||||
/*
 * Entry, dispatch and unit-stride preload.
 * Registers as used by the visible code:
 *   $16 element count, $20 source pointer with stride $21,
 *   $24 destination pointer with stride $23, $f30 multiplier (copied from $f19),
 *   $1 = $16 >> 3 full groups of eight, $2 = $16 & 7 leftover elements.
 * NOTE(review): $24 and $23 are read from 0($sp) and 8($sp) before the frame
 * is allocated; presumably stack-passed arguments (y and incy) - confirm.
 * Returns early through $End when the count is <= 0 or the multiplier is zero.
 * Takes the $Sub path unless both strides equal 1.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 16, $26, 0
|
||||
|
||||
ldq $24, 0($sp)
|
||||
fmov $f19, $f30
|
||||
ldl $23, 8($sp)
|
||||
lda $sp, -16($sp)
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
nop
|
||||
sra $16, 3, $1
|
||||
/* $f2 and $f3 are saved before any branch to $End, which reloads them. */
stt $f2, 0($sp)
|
||||
cmpeq $21, 1, $3
|
||||
|
||||
stt $f3, 8($sp)
|
||||
cmpeq $23, 1, $4
|
||||
and $16, 7, $2
|
||||
ble $16, $End
|
||||
|
||||
and $3, $4, $3
|
||||
fbeq $f30, $End
|
||||
|
||||
beq $3, $Sub
|
||||
ble $1, $Remain
|
||||
.align 4
|
||||
|
||||
/* Unit-stride preload: eight source elements into $f10-$f17 and the */
/* matching eight destination elements into $f18-$f25. */
LD $f10, 0*SIZE($20)
|
||||
LD $f11, 1*SIZE($20)
|
||||
LD $f12, 2*SIZE($20)
|
||||
LD $f13, 3*SIZE($20)
|
||||
|
||||
LD $f18, 0*SIZE($24)
|
||||
LD $f19, 1*SIZE($24)
|
||||
LD $f20, 2*SIZE($24)
|
||||
LD $f21, 3*SIZE($24)
|
||||
|
||||
LD $f14, 4*SIZE($20)
|
||||
LD $f15, 5*SIZE($20)
|
||||
LD $f16, 6*SIZE($20)
|
||||
LD $f17, 7*SIZE($20)
|
||||
|
||||
LD $f22, 4*SIZE($24)
|
||||
LD $f23, 5*SIZE($24)
|
||||
LD $f24, 6*SIZE($24)
|
||||
LD $f25, 7*SIZE($24)
|
||||
|
||||
/* The source pointer moves past the loaded group; $24 stays put until */
/* the results for this group have been stored. */
subq $1, 1, $1
|
||||
addq $20, 8*SIZE, $20
|
||||
unop
|
||||
ble $1, $LoopEnd
|
||||
.align 4
|
||||
|
||||
/*
 * Unit-stride main loop: eight elements per pass.
 * For the group already held in registers it computes multiplier * source
 * ($f26-$f29), adds the destination values, and stores the eight results at
 * 0..7*SIZE($24). The next source group is loaded from 0..7*SIZE($20) and the
 * next destination group from 8..15*SIZE($24) before both pointers advance.
 * NOTE(review): the loads into $f31 and $31 discard their data; presumably
 * prefetch hints - confirm.
 */
$Loop:
|
||||
ldt $f31, PREFETCHSIZE * SIZE($24)
|
||||
ldl $31, PREFETCHSIZE * SIZE($20)
|
||||
|
||||
MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
||||
LD $f10, 0*SIZE($20)
|
||||
MUL $f30, $f11, $f27
|
||||
LD $f11, 1*SIZE($20)
|
||||
|
||||
MUL $f30, $f12, $f28
|
||||
LD $f12, 2*SIZE($20)
|
||||
MUL $f30, $f13, $f29
|
||||
LD $f13, 3*SIZE($20)
|
||||
|
||||
ADD $f18, $f26, $f0
|
||||
LD $f18, 8*SIZE($24)
|
||||
MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
||||
LD $f14, 4*SIZE($20)
|
||||
|
||||
ADD $f19, $f27, $f1
|
||||
LD $f19, 9*SIZE($24)
|
||||
MUL $f30, $f15, $f27
|
||||
LD $f15, 5*SIZE($20)
|
||||
|
||||
ADD $f20, $f28, $f2
|
||||
LD $f20, 10*SIZE($24)
|
||||
MUL $f30, $f16, $f28
|
||||
LD $f16, 6*SIZE($20)
|
||||
|
||||
ADD $f21, $f29, $f3
|
||||
LD $f21, 11*SIZE($24)
|
||||
MUL $f30, $f17, $f29
|
||||
LD $f17, 7*SIZE($20)
|
||||
|
||||
ST $f0, 0*SIZE($24)
|
||||
ADD $f22, $f26, $f0
|
||||
ST $f1, 1*SIZE($24)
|
||||
ADD $f23, $f27, $f1
|
||||
|
||||
ST $f2, 2*SIZE($24)
|
||||
ADD $f24, $f28, $f2
|
||||
ST $f3, 3*SIZE($24)
|
||||
ADD $f25, $f29, $f3
|
||||
|
||||
LD $f22, 12*SIZE($24)
|
||||
LD $f23, 13*SIZE($24)
|
||||
LD $f24, 14*SIZE($24)
|
||||
LD $f25, 15*SIZE($24)
|
||||
|
||||
ST $f0, 4*SIZE($24)
|
||||
ST $f1, 5*SIZE($24)
|
||||
ST $f2, 6*SIZE($24)
|
||||
ST $f3, 7*SIZE($24)
|
||||
|
||||
subq $1, 1, $1
|
||||
addq $24, 8*SIZE, $24
|
||||
addq $20, 8*SIZE, $20
|
||||
bgt $1, $Loop
|
||||
.align 4
|
||||
|
||||
/*
 * Unit-stride drain: the same multiply/add/store sequence as $Loop for the
 * last group held in registers, with no further loads. The destination
 * pointer $24 is advanced past the group afterwards.
 */
$LoopEnd:
|
||||
MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
||||
MUL $f30, $f11, $f27
|
||||
MUL $f30, $f12, $f28
|
||||
MUL $f30, $f13, $f29
|
||||
|
||||
ADD $f18, $f26, $f0
|
||||
MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
||||
ADD $f19, $f27, $f1
|
||||
MUL $f30, $f15, $f27
|
||||
|
||||
ADD $f20, $f28, $f2
|
||||
MUL $f30, $f16, $f28
|
||||
ADD $f21, $f29, $f3
|
||||
MUL $f30, $f17, $f29
|
||||
|
||||
ST $f0, 0*SIZE($24)
|
||||
ADD $f22, $f26, $f0
|
||||
ST $f1, 1*SIZE($24)
|
||||
ADD $f23, $f27, $f1
|
||||
|
||||
ST $f2, 2*SIZE($24)
|
||||
ADD $f24, $f28, $f2
|
||||
ST $f3, 3*SIZE($24)
|
||||
ADD $f25, $f29, $f3
|
||||
|
||||
ST $f0, 4*SIZE($24)
|
||||
ST $f1, 5*SIZE($24)
|
||||
ST $f2, 6*SIZE($24)
|
||||
ST $f3, 7*SIZE($24)
|
||||
addq $24, 8*SIZE, $24
|
||||
.align 4
|
||||
|
||||
/* Unit-stride tail: $2 leftover elements; go to $End when there are none. */
$Remain:
|
||||
ble $2, $End
|
||||
.align 4
|
||||
|
||||
/* One element per pass. Both pointers advance before the store, so the */
/* result is written back at -1*SIZE($24). */
$RemainLoop:
|
||||
LD $f10, 0*SIZE($20)
|
||||
LD $f11, 0*SIZE($24)
|
||||
addq $20, SIZE, $20
|
||||
addq $24, SIZE, $24
|
||||
|
||||
MUL $f30, $f10, $f12
|
||||
subq $2, 1, $2
|
||||
ADD $f11, $f12, $f13
|
||||
ST $f13, -1*SIZE($24)
|
||||
bgt $2, $RemainLoop
|
||||
.align 4
|
||||
|
||||
/* Exit for the unit-stride and early-return paths: reload $f2 and $f3, */
/* release the 16-byte frame and return. */
$End:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
lda $sp, 16($sp)
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/*
 * General-stride path: setup and preload.
 * $4 = $1 - 1 is the loop counter; with no full group of eight go to $SubRemain.
 * The source pointer $20 steps by stride $21. $22 becomes a destination read
 * pointer running ahead of $24, which stays at the store position.
 * Loads eight source elements into $f10-$f17 and eight destination elements
 * into $f18-$f25.
 * NOTE(review): SXADDQ presumably adds a scaled stride to a pointer (macro
 * defined outside this file) - confirm in common.h.
 * NOTE(review): the $22 value produced by SXSUBL below is overwritten before
 * any visible read; it looks unused.
 */
$Sub:
|
||||
SXSUBL $16, SIZE, $22
|
||||
subq $1, 1, $4
|
||||
ble $1, $SubRemain
|
||||
.align 4
|
||||
|
||||
LD $f10, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
LD $f11, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
LD $f12, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
LD $f13, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
LD $f18, 0($24)
|
||||
/* $22 = $24 advanced by one stride: start of the read-ahead pointer */
SXADDQ $23, $24, $22
|
||||
|
||||
LD $f19, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f20, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f21, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f14, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
LD $f15, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
LD $f16, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
LD $f17, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
LD $f22, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f23, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f24, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f25, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
unop
|
||||
ble $4, $SubLoopEnd
|
||||
.align 4
|
||||
|
||||
/*
 * General-stride main loop: eight elements per pass.
 * Computes multiplier * source for the group held in $f10-$f17 while loading
 * the next source group through $20, adds the destination values held in
 * $f18-$f25, and stores the results through $24. The next destination group
 * is loaded through the read-ahead pointer $22 between the two store batches.
 */
$SubLoop:
|
||||
MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
||||
LD $f10, 0($20)
|
||||
unop
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
MUL $f30, $f11, $f27
|
||||
LD $f11, 0($20)
|
||||
unop
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
MUL $f30, $f12, $f28
|
||||
LD $f12, 0($20)
|
||||
unop
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
MUL $f30, $f13, $f29
|
||||
LD $f13, 0($20)
|
||||
unop
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ADD $f18, $f26, $f0
|
||||
MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
||||
LD $f14, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ADD $f19, $f27, $f1
|
||||
MUL $f30, $f15, $f27
|
||||
LD $f15, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ADD $f20, $f28, $f2
|
||||
MUL $f30, $f16, $f28
|
||||
LD $f16, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ADD $f21, $f29, $f3
|
||||
MUL $f30, $f17, $f29
|
||||
LD $f17, 0($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
/* Store results 0-3 while forming results 4-7. */
ST $f0, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ADD $f22, $f26, $f0
|
||||
unop
|
||||
|
||||
ST $f1, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ADD $f23, $f27, $f1
|
||||
unop
|
||||
|
||||
ST $f2, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ADD $f24, $f28, $f2
|
||||
unop
|
||||
|
||||
ST $f3, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ADD $f25, $f29, $f3
|
||||
unop
|
||||
|
||||
/* Read the next eight destination elements through $22. */
LD $f18, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f19, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f20, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f21, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f22, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f23, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
LD $f24, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
LD $f25, 0($22)
|
||||
SXADDQ $23, $22, $22
|
||||
|
||||
/* Store results 4-7. */
ST $f0, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f1, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f2, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f3, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
|
||||
subq $4, 1, $4
|
||||
bgt $4, $SubLoop
|
||||
.align 4
|
||||
|
||||
/*
 * General-stride drain: multiply/add/store for the last group held in
 * registers, with no further loads. Results are stored through $24 in two
 * batches of four, and $24 advances by stride $23 after each store.
 */
$SubLoopEnd:
|
||||
MUL $f30, $f10, $f26 # ctemp1 = da * atemp1
|
||||
MUL $f30, $f11, $f27
|
||||
MUL $f30, $f12, $f28
|
||||
MUL $f30, $f13, $f29
|
||||
|
||||
ADD $f18, $f26, $f0
|
||||
MUL $f30, $f14, $f26 # ctemp1 = da * atemp1
|
||||
ADD $f19, $f27, $f1
|
||||
MUL $f30, $f15, $f27
|
||||
|
||||
ADD $f20, $f28, $f2
|
||||
MUL $f30, $f16, $f28
|
||||
ADD $f21, $f29, $f3
|
||||
MUL $f30, $f17, $f29
|
||||
|
||||
ST $f0, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f1, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
|
||||
ST $f2, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f3, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
|
||||
ADD $f22, $f26, $f0
|
||||
ADD $f23, $f27, $f1
|
||||
ADD $f24, $f28, $f2
|
||||
ADD $f25, $f29, $f3
|
||||
|
||||
ST $f0, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f1, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
|
||||
ST $f2, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
ST $f3, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
.align 4
|
||||
|
||||
/* General-stride tail: $2 leftover elements; go to $SubEnd when there are none. */
$SubRemain:
|
||||
ble $2, $SubEnd
|
||||
.align 4
|
||||
|
||||
/* One element per pass: load source and destination, advance the source */
/* pointer, store multiplier * source + destination, advance the destination pointer. */
$SubRemainLoop:
|
||||
LD $f10, 0($20)
|
||||
LD $f11, 0($24)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
MUL $f30, $f10, $f12
|
||||
subq $2, 1, $2
|
||||
ADD $f11, $f12, $f13
|
||||
ST $f13, 0($24)
|
||||
SXADDQ $23, $24, $24
|
||||
|
||||
bgt $2, $SubRemainLoop
|
||||
.align 4
|
||||
|
||||
/* Exit for the general-stride path: the same restore sequence as $End */
/* (reload $f2 and $f3, release the 16-byte frame, return). */
$SubEnd:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
lda $sp, 16($sp)
|
||||
ret
|
||||
EPILOGUE
|
||||
71
kernel/alpha/cabs.S
Normal file
71
kernel/alpha/cabs.S
Normal file
@@ -0,0 +1,71 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Loads two consecutive elements from the address in $16 (offsets 0 and
 * SIZE) and returns the sum of their absolute values in $f0.
 * NOTE(review): presumably the real and imaginary parts of one complex
 * value - confirm against the caller.
 * The entry sequence is written out explicitly here; NAME, LD, ADD, SIZE and
 * VERSION are macros defined outside this file.
 */
.set noat
|
||||
.set noreorder
|
||||
.text
|
||||
.align 5
|
||||
.globl NAME
|
||||
.ent NAME
|
||||
NAME:
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
/* Profiling builds load $gp and call _mcount first. */
#ifdef PROFILE
|
||||
ldgp $gp, 0($27)
|
||||
lda $28, _mcount
|
||||
jsr $28, ($28), _mcount
|
||||
#endif
|
||||
|
||||
LD $f10, 0($16)
|
||||
LD $f11, SIZE($16)
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
/* $f0 = |first| + |second| */
fabs $f10, $f12
|
||||
fabs $f11, $f0
|
||||
ADD $f12, $f0, $f0
|
||||
ret
|
||||
.end NAME
|
||||
.ident VERSION
|
||||
426
kernel/alpha/cnrm2.S
Normal file
426
kernel/alpha/cnrm2.S
Normal file
@@ -0,0 +1,426 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Symbolic names for the registers and constants used by this kernel. */
/* PREFETCH_SIZE: look-ahead distance, in elements, of the `ldl $31` in the main loop. */
#define PREFETCH_SIZE 80
|
||||
|
||||
/* N: element count (tested against zero and shifted to form loop counts). */
#define N $16
|
||||
/* X: running pointer into the input vector. */
#define X $17
|
||||
/* INCX: stride; scaled by ZBASE_SHIFT on entry, then added to X directly. */
#define INCX $18
|
||||
/* XX: snapshot of X so loads can keep using the old base after X advances. */
#define XX $19
|
||||
|
||||
/* I: loop counter. */
#define I $0
|
||||
|
||||
/* a0..a3: four independent partial sums of squares. */
#define a0 $f0
|
||||
#define a1 $f1
|
||||
#define a2 $f10
|
||||
#define a3 $f11
|
||||
/* t0..t3: squares computed by mult, added into a0..a3 one step later. */
#define t0 $f12
|
||||
#define t1 $f13
|
||||
#define t2 $f14
|
||||
#define t3 $f15
|
||||
|
||||
/* x0..x7: values loaded from the input vector. */
#define x0 $f16
|
||||
#define x1 $f17
|
||||
#define x2 $f18
|
||||
#define x3 $f19
|
||||
#define x4 $f20
|
||||
#define x5 $f21
|
||||
#define x6 $f22
|
||||
#define x7 $f23
|
||||
|
||||
/* Entry: set up the frame (EV4/EV5 only), clear the accumulators, scale the
   stride, and choose between the contiguous path (falls through) and the
   strided path ($L20). Returns immediately via $L999 when N <= 0. */
PROLOGUE
|
||||
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* This variant calls the external sqrt routine at $L998, so it needs a
   16-byte frame to save $26 and loads the address of sqrt into $27 up front. */
.frame $30,16,$26,0
|
||||
.mask 0x4000000,-16
|
||||
ldah $29, 0($27) !gpdisp!1
|
||||
lda $29, 0($29) !gpdisp!1
|
||||
|
||||
lda $sp, -16($sp)
|
||||
ldq $27, sqrt($29) !literal!2
|
||||
stq $26, 0($sp)
|
||||
|
||||
PROFCODE
|
||||
.prologue 1
|
||||
#else
|
||||
PROFCODE
|
||||
#endif
|
||||
|
||||
fclr a0
|
||||
/* INCX <<= ZBASE_SHIFT; the result is compared with 2 * SIZE below and later
   added straight to X. */
sll INCX, ZBASE_SHIFT, INCX
|
||||
fclr a1
|
||||
ble N, $L999
|
||||
|
||||
fclr a2
|
||||
/* $0 = 1 when the scaled stride equals 2 * SIZE (contiguous pairs). */
cmpeq INCX, 2 * SIZE, $0
|
||||
fclr a3
|
||||
beq $0, $L20
|
||||
|
||||
fclr t0
|
||||
/* I = N / 8: each unrolled iteration consumes 16 values (8 pairs). */
sra N, 3, I
|
||||
fclr t1
|
||||
ble I, $L15
|
||||
|
||||
fclr t2
|
||||
/* Preload the first 8 values so the loop can overlap loads with arithmetic. */
LD x0, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
LD x2, 2 * SIZE(X)
|
||||
LD x3, 3 * SIZE(X)
|
||||
LD x4, 4 * SIZE(X)
|
||||
LD x5, 5 * SIZE(X)
|
||||
LD x6, 6 * SIZE(X)
|
||||
LD x7, 7 * SIZE(X)
|
||||
|
||||
/* One iteration is handled by the drain code at $L12, hence the decrement. */
lda I, -1(I)
|
||||
ble I, $L12
|
||||
.align 4
|
||||
|
||||
/* $L11: software-pipelined contiguous loop. Each group adds the previous
   square (tN) into its accumulator (aN), squares a value loaded earlier, and
   loads a replacement value. One pass consumes 16 values and advances X by
   16 * SIZE; loads after the `lda X` use XX (the old X) as their base. */
$L11:
|
||||
addt a0, t0, a0
|
||||
/* Load into $31; the value is discarded (presumably a prefetch hint). */
ldl $31, (PREFETCH_SIZE) * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
/* Keep the pre-increment base for the loads that follow `lda X` below. */
mov X, XX
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x0, x0, t0
|
||||
LD x0, 16 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
LD x1, 17 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 18 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 19 * SIZE(XX)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 20 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda I, -1(I)
|
||||
mult x5, x5, t1
|
||||
LD x5, 21 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 22 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
LD x7, 23 * SIZE(XX)
|
||||
bgt I, $L11
|
||||
.align 4
|
||||
|
||||
/* $L12: drain of the contiguous pipeline. Squares the 8 values already in
   registers, loads and squares the last 8 values of the 16-value group, and
   advances X past the group. On exit t2/t3 have been folded into a2/a3;
   t0/t1 are still pending and are added either by $L16 or by $L998. */
$L12:
|
||||
addt a0, t0, a0
|
||||
mov X, XX
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(XX)
|
||||
|
||||
/* No more loads from here on: only the remaining squares and sums. */
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* $L15/$L16: contiguous remainder. Handles the N & 7 leftover pairs one at a
   time: add the pending squares t0/t1 into a0/a1, then square the two values
   of the current pair. The last two squares stay in t0/t1 for $L998. */
$L15:
|
||||
and N, 7, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L16:
|
||||
LD x0, 0 * SIZE(X)
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
lda X, 2 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L16
|
||||
/* Plain jump over the strided path. Use br rather than bsr: the two differ
   only in the branch-prediction hint, and bsr marks this as a subroutine
   call even though no return ever matches it. */
br $31, $L998
|
||||
.align 4
|
||||
|
||||
/* $L20: strided path setup. I = N / 4 because each unrolled iteration
   consumes 4 pairs. Preloads 7 of the first 8 values (x7 is loaded at the top
   of $L21/$L22); X is stepped by INCX after each pair. */
$L20:
|
||||
fclr t0
|
||||
sra N, 2, I
|
||||
fclr t1
|
||||
ble I, $L25
|
||||
|
||||
LD x0, 0 * SIZE(X)
|
||||
fclr t2
|
||||
LD x1, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x2, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x3, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x4, 0 * SIZE(X)
|
||||
/* One iteration is handled by the drain code at $L22. */
lda I, -1(I)
|
||||
LD x5, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x6, 0 * SIZE(X)
|
||||
ble I, $L22
|
||||
.align 4
|
||||
|
||||
/* $L21: software-pipelined strided loop, 4 pairs (8 values) per pass. Each
   group adds the previous square into its accumulator, squares a value that
   is already loaded, and loads its replacement; X moves by INCX after the
   second value of every pair. */
$L21:
|
||||
addt a0, t0, a0
|
||||
LD x7, 1 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x0, 0 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
unop
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x1, 1 * SIZE(X)
|
||||
mult x2, x2, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x2, 0 * SIZE(X)
|
||||
mult x3, x3, t3
|
||||
unop
|
||||
|
||||
addt a0, t0, a0
|
||||
LD x3, 1 * SIZE(X)
|
||||
mult x4, x4, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x4, 0 * SIZE(X)
|
||||
mult x5, x5, t1
|
||||
lda I, -1(I)
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x5, 1 * SIZE(X)
|
||||
mult x6, x6, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x6, 0 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
bgt I, $L21
|
||||
.align 4
|
||||
|
||||
/* $L22: drain of the strided pipeline. Loads the one outstanding value (x7),
   squares the 8 values held in registers and folds t2/t3 into a2/a3.
   t0/t1 remain pending for $L26 or $L998. */
$L22:
|
||||
addt a0, t0, a0
|
||||
LD x7, 1 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* $L25/$L26: strided remainder. Handles the N & 3 leftover pairs one at a
   time, then falls through into $L998 with the last squares still in t0/t1. */
$L25:
|
||||
and N, 3, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L26:
|
||||
LD x0, 0 * SIZE(X)
|
||||
lda I, -1(I)
|
||||
LD x1, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
|
||||
/* $L998: final reduction. Adds the pending squares t0/t1, combines the four
   partial sums and takes the square root. $L999 is the common exit, also
   reached directly when N <= 0 (a0 is then still zero). */
$L998:
|
||||
addt a0, t0, a0
|
||||
addt a1, t1, a1
|
||||
|
||||
addt a0, a1, a0
|
||||
addt a2, a3, a2
|
||||
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Put the total in $f16 and call the external sqrt through $27 (loaded in
   the prologue), then rebuild $29 from the return address in $26. */
addt a0, a2, $f16
|
||||
jsr $26, ($27), sqrt !lituse_jsr!2
|
||||
|
||||
ldah $29, 0($26) !gpdisp!3
|
||||
lda $29, 0($29) !gpdisp!3
|
||||
#else
|
||||
/* Other targets use the sqrtt instruction directly. */
addt a0, a2, a0
|
||||
sqrtt a0, a0
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
$L999:
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Restore the return address saved in the prologue and pop the frame. */
ldq $26, 0($sp)
|
||||
lda $sp, 16($sp)
|
||||
#endif
|
||||
ret
|
||||
EPILOGUE
|
||||
379
kernel/alpha/copy.S
Normal file
379
kernel/alpha/copy.S
Normal file
@@ -0,0 +1,379 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Symbolic register names for the copy kernel. */
/* N: element count. */
#define N $16
|
||||
/* X / INCX: source pointer and source stride (compared with 1 on entry). */
#define X $17
|
||||
#define INCX $18
|
||||
/* Y / INCY: destination pointer and destination stride. */
#define Y $19
|
||||
#define INCY $20
|
||||
|
||||
/* Entry: return at once when N <= 0, take the strided path ($Sub) unless both
   INCX and INCY equal 1, otherwise preload 16 values for the contiguous loop.
   $4 holds the number of full 16-value groups, $5 the leftover element count
   (per-element size differs between the real and COMPLEX builds). */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
cmpeq INCX, 1, $0
|
||||
ble N, $End
|
||||
#ifndef COMPLEX
|
||||
/* Real: 16 elements per group. */
sra N, 4, $4
|
||||
#else
|
||||
/* Complex: 8 elements (16 values) per group. */
sra N, 3, $4
|
||||
#endif
|
||||
cmpeq INCY, 1, $1
|
||||
|
||||
/* $0 = (INCX == 1) && (INCY == 1). */
and $0, $1, $0
|
||||
beq $0, $Sub
|
||||
#ifndef COMPLEX
|
||||
and N, 15, $5
|
||||
#else
|
||||
and N, 7, $5
|
||||
#endif
|
||||
ble $4, $Remain
|
||||
|
||||
LD $f10, 0*SIZE(X)
|
||||
LD $f11, 1*SIZE(X)
|
||||
LD $f12, 2*SIZE(X)
|
||||
LD $f13, 3*SIZE(X)
|
||||
LD $f14, 4*SIZE(X)
|
||||
LD $f15, 5*SIZE(X)
|
||||
LD $f16, 6*SIZE(X)
|
||||
LD $f17, 7*SIZE(X)
|
||||
|
||||
LD $f18, 8*SIZE(X)
|
||||
LD $f19, 9*SIZE(X)
|
||||
LD $f20, 10*SIZE(X)
|
||||
LD $f21, 11*SIZE(X)
|
||||
LD $f22, 12*SIZE(X)
|
||||
LD $f23, 13*SIZE(X)
|
||||
LD $f24, 14*SIZE(X)
|
||||
LD $f25, 15*SIZE(X)
|
||||
|
||||
/* The last group is stored by $MainLoopEnd, so the loop runs one pass fewer. */
subq $4, 1, $4
|
||||
lda X, 16*SIZE(X)
|
||||
ble $4, $MainLoopEnd
|
||||
.align 4
|
||||
|
||||
/* $MainLoop: contiguous copy, 16 values per pass. Stores the group loaded on
   the previous pass while loading the next group, four registers at a time,
   then advances both pointers by 16 values. */
$MainLoop:
|
||||
ST $f10, 0*SIZE(Y)
|
||||
ST $f11, 1*SIZE(Y)
|
||||
ST $f12, 2*SIZE(Y)
|
||||
ST $f13, 3*SIZE(Y)
|
||||
|
||||
LD $f10, 0*SIZE(X)
|
||||
LD $f11, 1*SIZE(X)
|
||||
LD $f12, 2*SIZE(X)
|
||||
LD $f13, 3*SIZE(X)
|
||||
|
||||
ST $f14, 4*SIZE(Y)
|
||||
ST $f15, 5*SIZE(Y)
|
||||
ST $f16, 6*SIZE(Y)
|
||||
ST $f17, 7*SIZE(Y)
|
||||
|
||||
LD $f14, 4*SIZE(X)
|
||||
LD $f15, 5*SIZE(X)
|
||||
LD $f16, 6*SIZE(X)
|
||||
LD $f17, 7*SIZE(X)
|
||||
|
||||
ST $f18, 8*SIZE(Y)
|
||||
ST $f19, 9*SIZE(Y)
|
||||
ST $f20, 10*SIZE(Y)
|
||||
ST $f21, 11*SIZE(Y)
|
||||
|
||||
LD $f18, 8*SIZE(X)
|
||||
LD $f19, 9*SIZE(X)
|
||||
LD $f20, 10*SIZE(X)
|
||||
LD $f21, 11*SIZE(X)
|
||||
|
||||
ST $f22, 12*SIZE(Y)
|
||||
ST $f23, 13*SIZE(Y)
|
||||
ST $f24, 14*SIZE(Y)
|
||||
ST $f25, 15*SIZE(Y)
|
||||
|
||||
LD $f22, 12*SIZE(X)
|
||||
LD $f23, 13*SIZE(X)
|
||||
LD $f24, 14*SIZE(X)
|
||||
LD $f25, 15*SIZE(X)
|
||||
|
||||
subq $4, 1, $4
|
||||
lda Y, 16*SIZE(Y)
|
||||
lda X, 16*SIZE(X)
|
||||
bgt $4, $MainLoop
|
||||
.align 4
|
||||
|
||||
/* $MainLoopEnd: store the final 16 values that are already in registers and
   advance Y past them (X was advanced when they were loaded). */
$MainLoopEnd:
|
||||
ST $f10, 0*SIZE(Y)
|
||||
ST $f11, 1*SIZE(Y)
|
||||
ST $f12, 2*SIZE(Y)
|
||||
ST $f13, 3*SIZE(Y)
|
||||
ST $f14, 4*SIZE(Y)
|
||||
ST $f15, 5*SIZE(Y)
|
||||
ST $f16, 6*SIZE(Y)
|
||||
ST $f17, 7*SIZE(Y)
|
||||
|
||||
ST $f18, 8*SIZE(Y)
|
||||
ST $f19, 9*SIZE(Y)
|
||||
ST $f20, 10*SIZE(Y)
|
||||
ST $f21, 11*SIZE(Y)
|
||||
ST $f22, 12*SIZE(Y)
|
||||
ST $f23, 13*SIZE(Y)
|
||||
ST $f24, 14*SIZE(Y)
|
||||
ST $f25, 15*SIZE(Y)
|
||||
|
||||
lda Y, 16*SIZE(Y)
|
||||
.align 4
|
||||
|
||||
/* $Remain/$RemainLoop: contiguous tail. Copies the $5 leftover elements one
   at a time (one value per element, or two in the COMPLEX build). $End is the
   exit for the contiguous path and for N <= 0. */
$Remain:
|
||||
ble $5, $End
|
||||
.align 4
|
||||
|
||||
$RemainLoop:
|
||||
#ifndef COMPLEX
|
||||
LD $f10, 0*SIZE(X)
|
||||
lda X, 1*SIZE(X)
|
||||
ST $f10, 0*SIZE(Y)
|
||||
lda Y, 1*SIZE(Y)
|
||||
#else
|
||||
LD $f10, 0*SIZE(X)
|
||||
LD $f11, 1*SIZE(X)
|
||||
lda X, 2*SIZE(X)
|
||||
ST $f10, 0*SIZE(Y)
|
||||
ST $f11, 1*SIZE(Y)
|
||||
lda Y, 2*SIZE(Y)
|
||||
#endif
|
||||
subq $5, 1, $5
|
||||
bgt $5, $RemainLoop
|
||||
.align 4
|
||||
$End:
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* $Sub: strided path setup. In the COMPLEX build both strides are doubled
   first. $5 = leftover element count; $4 (groups) was computed at entry. */
$Sub:
|
||||
#ifdef COMPLEX
|
||||
addq INCX, INCX, INCX
|
||||
addq INCY, INCY, INCY
|
||||
and N, 7, $5
|
||||
#else
|
||||
and N, 15, $5
|
||||
#endif
|
||||
ble $4, $SubRemain
|
||||
.align 4
|
||||
|
||||
/* $SubMainLoop: strided copy of one full group per pass. All 16 values are
   loaded into $f10..$f25 first and stored afterwards. SXADDQ is a macro
   defined outside this file; presumably it steps the pointer by the stride
   scaled to the element size - confirm in common.h. */
$SubMainLoop:
|
||||
#ifndef COMPLEX
|
||||
/* Real build: 16 elements, one value each. */
LD $f10, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f11, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f12, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f13, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f14, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f16, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f17, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f18, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f19, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f20, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f21, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f22, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f23, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f24, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f25, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST $f10, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f11, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f12, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f13, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f14, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f15, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f16, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f17, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f18, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f19, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f20, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f21, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f22, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f23, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f24, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
ST $f25, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
#else
|
||||
/* COMPLEX build: 8 elements, two adjacent values each. */
LD $f10, 0(X)
|
||||
LD $f11, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f12, 0(X)
|
||||
LD $f13, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f14, 0(X)
|
||||
LD $f15, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f16, 0(X)
|
||||
LD $f17, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f18, 0(X)
|
||||
LD $f19, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f20, 0(X)
|
||||
LD $f21, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f22, 0(X)
|
||||
LD $f23, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD $f24, 0(X)
|
||||
LD $f25, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST $f10, 0(Y)
|
||||
ST $f11, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f12, 0(Y)
|
||||
ST $f13, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f14, 0(Y)
|
||||
ST $f15, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f16, 0(Y)
|
||||
ST $f17, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f18, 0(Y)
|
||||
ST $f19, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f20, 0(Y)
|
||||
ST $f21, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f22, 0(Y)
|
||||
ST $f23, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ST $f24, 0(Y)
|
||||
ST $f25, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
#endif
|
||||
subq $4, 1, $4
|
||||
bgt $4, $SubMainLoop
|
||||
.align 4
|
||||
|
||||
/* $SubRemain/$SubRemainLoop: strided tail. Copies the $5 leftover elements
   one at a time, stepping X and Y by their strides; $SubEnd returns. */
$SubRemain:
|
||||
ble $5, $SubEnd
|
||||
.align 4
|
||||
|
||||
$SubRemainLoop:
|
||||
#ifndef COMPLEX
|
||||
LD $f10, 0(X)
|
||||
SXADDQ INCX, X, X
|
||||
ST $f10, 0(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
#else
|
||||
LD $f10, 0(X)
|
||||
LD $f11, SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
ST $f10, 0(Y)
|
||||
ST $f11, SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
#endif
|
||||
subq $5, 1, $5
|
||||
bgt $5, $SubRemainLoop
|
||||
.align 4
|
||||
|
||||
$SubEnd:
|
||||
ret
|
||||
EPILOGUE
|
||||
217
kernel/alpha/cscal.S
Normal file
217
kernel/alpha/cscal.S
Normal file
@@ -0,0 +1,217 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Assembler modes for this file: noat lets the code name $28 explicitly (the
   PROFILE path below does), noreorder asks the assembler to keep the
   instruction order as written. */
.set noat
|
||||
||||
.set noreorder
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Entry point. After argument unpacking: $16 = n, $f1 = alpha, $18 = load
   pointer, $20 = store pointer (same start address), $19 = incx.
   Returns immediately when n <= 0 or incx <= 0. */
.globl NAME
|
||||
.ent NAME
|
||||
|
||||
NAME:
|
||||
#ifdef PROFILE
|
||||
ldgp $gp, 0($27)
|
||||
lda $28, _mcount
|
||||
jsr $28, ($28), _mcount
|
||||
#endif
|
||||
|
||||
#ifndef C_INTERFACE
|
||||
/* Default interface: n, alpha and incx are passed by address and are
   dereferenced here. */
ldl $16, 0($16) # n
|
||||
mov $18, $20 # Store Address
|
||||
ldl $19, 0($19) # incx
|
||||
nop
|
||||
|
||||
LD $f1, 0($17) # alpha
|
||||
#else
|
||||
/* C interface: n and incx are used as passed; alpha arrives in $f17. */
mov $18, $20 # Store Address
|
||||
fmov $f17, $f1 # alpha
|
||||
#endif
|
||||
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
/* $21 = n / 2: each main-loop pass handles two elements (four values). */
sra $16, 1, $21 # 4-unrolling
|
||||
ble $16, $End
|
||||
|
||||
/* $23 = incx - 1; positive means a non-unit stride. */
lda $23, -1($19)
|
||||
ble $19, $End
|
||||
|
||||
bgt $23, $INC_NOT_1
|
||||
.align 4
|
||||
|
||||
/* Unit-stride path. Preloads four values, then $MainLoop multiplies the four
   loaded values by $f1 while loading the next four, and stores the products
   through $20. $MainRemain scales and stores the last preloaded group. */
ble $21, $Sub
|
||||
lda $21, -1($21)
|
||||
LD $f10, 0*SIZE($18)
|
||||
LD $f11, 1*SIZE($18)
|
||||
|
||||
LD $f12, 2*SIZE($18)
|
||||
LD $f13, 3*SIZE($18)
|
||||
lda $18, 4*SIZE($18)
|
||||
ble $21, $MainRemain
|
||||
.align 4
|
||||
|
||||
$MainLoop:
|
||||
MUL $f10, $f1, $f20
|
||||
LD $f10, 0*SIZE($18)
|
||||
MUL $f11, $f1, $f21
|
||||
LD $f11, 1*SIZE($18)
|
||||
|
||||
MUL $f12, $f1, $f22
|
||||
LD $f12, 2*SIZE($18)
|
||||
MUL $f13, $f1, $f23
|
||||
LD $f13, 3*SIZE($18)
|
||||
|
||||
lda $18, 4*SIZE($18)
|
||||
lda $21, -1($21)
|
||||
|
||||
ST $f20, 0*SIZE($20)
|
||||
ST $f21, 1*SIZE($20)
|
||||
ST $f22, 2*SIZE($20)
|
||||
ST $f23, 3*SIZE($20)
|
||||
lda $20, 4*SIZE($20)
|
||||
|
||||
bgt $21, $MainLoop
|
||||
.align 4
|
||||
|
||||
$MainRemain:
|
||||
MUL $f10, $f1, $f20
|
||||
MUL $f11, $f1, $f21
|
||||
MUL $f12, $f1, $f22
|
||||
MUL $f13, $f1, $f23
|
||||
|
||||
ST $f20, 0*SIZE($20)
|
||||
ST $f21, 1*SIZE($20)
|
||||
ST $f22, 2*SIZE($20)
|
||||
ST $f23, 3*SIZE($20)
|
||||
lda $20, 4*SIZE($20)
|
||||
.align 4
|
||||
|
||||
/* $Sub: unit-stride tail. When n is odd (low bit of $16 set) one element of
   two values is left; scale it by $f1 and store it. $End returns. */
$Sub:
|
||||
blbc $16, $End
|
||||
LD $f10, 0*SIZE($18)
|
||||
LD $f11, 1*SIZE($18)
|
||||
MUL $f10, $f1, $f20
|
||||
MUL $f11, $f1, $f21
|
||||
ST $f20, 0*SIZE($20)
|
||||
ST $f21, 1*SIZE($20)
|
||||
.align 4
|
||||
|
||||
$End:
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* $INC_NOT_1: non-unit-stride path. incx is doubled first; the load pointer
   ($18) and store pointer ($20) are both stepped with SXADDQ, a macro defined
   outside this file (presumably stride scaled by element size - confirm in
   common.h). Two elements (four values) are processed per pass, pipelined
   like the unit-stride loop. */
$INC_NOT_1:
|
||||
addl $19, $19, $19
|
||||
ble $21, $INC_Sub
|
||||
lda $21, -1($21)
|
||||
|
||||
LD $f10, 0*SIZE($18)
|
||||
LD $f11, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
LD $f12, 0*SIZE($18)
|
||||
LD $f13, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
ble $21, $INC_MainRemain
|
||||
.align 4
|
||||
|
||||
$INC_MainLoop:
|
||||
MUL $f10, $f1, $f20
|
||||
LD $f10, 0*SIZE($18)
|
||||
MUL $f11, $f1, $f21
|
||||
LD $f11, 1*SIZE($18)
|
||||
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
MUL $f12, $f1, $f22
|
||||
LD $f12, 0*SIZE($18)
|
||||
MUL $f13, $f1, $f23
|
||||
LD $f13, 1*SIZE($18)
|
||||
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
ST $f20, 0*SIZE($20)
|
||||
lda $21, -1($21)
|
||||
ST $f21, 1*SIZE($20)
|
||||
SXADDQ $19, $20, $20
|
||||
|
||||
ST $f22, 0*SIZE($20)
|
||||
ST $f23, 1*SIZE($20)
|
||||
SXADDQ $19, $20, $20
|
||||
unop
|
||||
bgt $21, $INC_MainLoop
|
||||
.align 4
|
||||
|
||||
/* Scale and store the last two preloaded elements. */
$INC_MainRemain:
|
||||
MUL $f10, $f1, $f20
|
||||
MUL $f11, $f1, $f21
|
||||
MUL $f12, $f1, $f22
|
||||
MUL $f13, $f1, $f23
|
||||
|
||||
ST $f20, 0*SIZE($20)
|
||||
ST $f21, 1*SIZE($20)
|
||||
SXADDQ $19, $20, $20
|
||||
|
||||
ST $f22, 0*SIZE($20)
|
||||
ST $f23, 1*SIZE($20)
|
||||
SXADDQ $19, $20, $20
|
||||
.align 4
|
||||
|
||||
/* $INC_Sub: non-unit-stride tail. When n is odd, scale and store the single
   remaining element (two values). $INC_End returns and closes the procedure. */
$INC_Sub:
|
||||
blbc $16, $INC_End
|
||||
|
||||
LD $f10, 0*SIZE($18)
|
||||
LD $f11, 1*SIZE($18)
|
||||
MUL $f10, $f1, $f20
|
||||
MUL $f11, $f1, $f21
|
||||
|
||||
ST $f20, 0*SIZE($20)
|
||||
ST $f21, 1*SIZE($20)
|
||||
.align 4
|
||||
|
||||
$INC_End:
|
||||
ret
|
||||
.end NAME
|
||||
.ident VERSION
|
||||
431
kernel/alpha/dnrm2.S
Normal file
431
kernel/alpha/dnrm2.S
Normal file
@@ -0,0 +1,431 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Symbolic names for the registers and constants used by this kernel. */
/* PREFETCH_SIZE: look-ahead distance, in elements, of the `ldl $31` in the main loop. */
#define PREFETCH_SIZE 80
|
||||
|
||||
/* N: element count (tested against zero and shifted to form loop counts). */
#define N $16
|
||||
/* X: running pointer into the input vector. */
#define X $17
|
||||
/* INCX: stride; rescaled on entry and then added to X directly. */
#define INCX $18
|
||||
/* XX: snapshot of X so loads can keep using the old base after X advances. */
#define XX $19
|
||||
|
||||
/* I: loop counter. */
#define I $0
|
||||
|
||||
/* a0..a3: four independent partial sums of squares. */
#define a0 $f0
|
||||
#define a1 $f1
|
||||
#define a2 $f10
|
||||
#define a3 $f11
|
||||
/* t0..t3: squares computed by mult, added into a0..a3 one step later. */
#define t0 $f12
|
||||
#define t1 $f13
|
||||
#define t2 $f14
|
||||
#define t3 $f15
|
||||
|
||||
/* x0..x7: values loaded from the input vector. */
#define x0 $f16
|
||||
#define x1 $f17
|
||||
#define x2 $f18
|
||||
#define x3 $f19
|
||||
#define x4 $f20
|
||||
#define x5 $f21
|
||||
#define x6 $f22
|
||||
#define x7 $f23
|
||||
|
||||
/* Entry: set up the frame (EV4/EV5 only), clear the accumulators, scale the
   stride, and choose between the contiguous path (falls through) and the
   strided path ($L20). Returns immediately via $L999 when N <= 0. */
PROLOGUE
|
||||
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* This variant calls the external sqrt routine at $L998, so it needs a
   16-byte frame to save $26 and loads the address of sqrt into $27 up front. */
.frame $30,16,$26,0
|
||||
.mask 0x4000000,-16
|
||||
ldah $29, 0($27) !gpdisp!1
|
||||
lda $29, 0($29) !gpdisp!1
|
||||
|
||||
lda $sp, -16($sp)
|
||||
ldq $27, sqrt($29) !literal!2
|
||||
stq $26, 0($sp)
|
||||
|
||||
PROFCODE
|
||||
.prologue 1
|
||||
#else
|
||||
PROFCODE
|
||||
#endif
|
||||
|
||||
fclr a0
|
||||
/* Rescale INCX (SXADDQ is defined outside this file; presumably INCX * SIZE -
   confirm in common.h). The result is compared with SIZE below. */
SXADDQ INCX, 0, INCX
|
||||
fclr a1
|
||||
ble N, $L999
|
||||
|
||||
fclr a2
|
||||
/* $0 = 1 when the scaled stride equals SIZE (contiguous elements). */
cmpeq INCX, SIZE, $0
|
||||
fclr a3
|
||||
beq $0, $L20
|
||||
|
||||
fclr t0
|
||||
/* I = N / 16: each unrolled iteration consumes 16 values. */
sra N, 4, I
|
||||
fclr t1
|
||||
ble I, $L15
|
||||
|
||||
fclr t2
|
||||
/* Preload the first 8 values so the loop can overlap loads with arithmetic. */
LD x0, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
LD x2, 2 * SIZE(X)
|
||||
LD x3, 3 * SIZE(X)
|
||||
LD x4, 4 * SIZE(X)
|
||||
LD x5, 5 * SIZE(X)
|
||||
LD x6, 6 * SIZE(X)
|
||||
LD x7, 7 * SIZE(X)
|
||||
|
||||
/* One iteration is handled by the drain code at $L12, hence the decrement. */
lda I, -1(I)
|
||||
ble I, $L12
|
||||
.align 4
|
||||
|
||||
/* $L11: software-pipelined contiguous loop. Each group adds the previous
   square (tN) into its accumulator (aN), squares a value loaded earlier, and
   loads a replacement value. One pass consumes 16 values and advances X by
   16 * SIZE; loads after the `lda X` use XX (the old X) as their base. */
$L11:
|
||||
addt a0, t0, a0
|
||||
/* Load into $31; the value is discarded (presumably a prefetch hint). */
ldl $31, (PREFETCH_SIZE) * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
/* Keep the pre-increment base for the loads that follow `lda X` below. */
mov X, XX
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x0, x0, t0
|
||||
LD x0, 16 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
LD x1, 17 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 18 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 19 * SIZE(XX)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 20 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda I, -1(I)
|
||||
mult x5, x5, t1
|
||||
LD x5, 21 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 22 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
LD x7, 23 * SIZE(XX)
|
||||
bgt I, $L11
|
||||
.align 4
|
||||
|
||||
/* $L12: drain of the contiguous pipeline. Squares the 8 values already in
   registers, loads and squares the last 8 values of the 16-value group, and
   advances X past the group. On exit t1/t2/t3 have been folded into
   a1/a2/a3; only t0 is still pending (added by $L16 or $L998). */
$L12:
|
||||
addt a0, t0, a0
|
||||
mov X, XX
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(XX)
|
||||
|
||||
/* No more loads from here on: only the remaining squares and sums. */
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
addt a1, t1, a1
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* $L15/$L16: contiguous remainder. Handles the N & 15 leftover values one at
   a time: add the pending square t0 into a0, then square the current value.
   The last square stays in t0 for $L998. */
$L15:
|
||||
and N, 15, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L16:
|
||||
LD x0, 0 * SIZE(X)
|
||||
lda X, 1 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L16
|
||||
/* Plain jump over the strided path. Use br rather than bsr: the two differ
   only in the branch-prediction hint, and bsr marks this as a subroutine
   call even though no return ever matches it. */
br $31, $L998
|
||||
.align 4
|
||||
|
||||
/* $L20: strided path setup. I = N / 8 because each unrolled iteration
   consumes 8 values. Preloads 7 values (x7 is loaded at the top of
   $L21/$L22), stepping X by INCX after each load. */
$L20:
|
||||
fclr t0
|
||||
sra N, 3, I
|
||||
fclr t1
|
||||
ble I, $L25
|
||||
|
||||
fclr t2
|
||||
fclr t3
|
||||
|
||||
LD x0, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x1, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x2, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x3, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x4, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x5, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x6, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
/* One iteration is handled by the drain code at $L22. */
lda I, -1(I)
|
||||
ble I, $L22
|
||||
.align 4
|
||||
|
||||
/* $L21: software-pipelined strided loop, 8 values per pass. Each group adds
   the previous square into its accumulator, loads a new value, squares a
   value that is already loaded, and steps X by INCX. */
$L21:
|
||||
addt a0, t0, a0
|
||||
LD x7, 0 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x0, 0 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
addq X, INCX, X
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x1, 0 * SIZE(X)
|
||||
mult x2, x2, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x2, 0 * SIZE(X)
|
||||
mult x3, x3, t3
|
||||
addq X, INCX, X
|
||||
|
||||
addt a0, t0, a0
|
||||
LD x3, 0 * SIZE(X)
|
||||
mult x4, x4, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x4, 0 * SIZE(X)
|
||||
mult x5, x5, t1
|
||||
addq X, INCX, X
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x5, 0 * SIZE(X)
|
||||
mult x6, x6, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x6, 0 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
addq X, INCX, X
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L21
|
||||
.align 4
|
||||
|
||||
/* $L22: drain of the strided pipeline. Loads the one outstanding value (x7),
   squares the 8 values held in registers and folds t1/t2/t3 into a1/a2/a3.
   Only t0 remains pending (added by $L26 or $L998). */
$L22:
|
||||
addt a0, t0, a0
|
||||
LD x7, 0 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
unop
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
addt a1, t1, a1
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* $L25/$L26: strided remainder. Handles the N & 7 leftover values one at a
   time, then falls through into $L998 with the last square still in t0. */
$L25:
|
||||
and N, 7, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L26:
|
||||
LD x0, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
|
||||
/* $L998: final reduction. Adds the pending square t0, combines the four
   partial sums and takes the square root. $L999 is the common exit, also
   reached directly when N <= 0 (a0 is then still zero). */
$L998:
|
||||
addt a0, t0, a0
|
||||
|
||||
addt a0, a1, a0
|
||||
addt a2, a3, a2
|
||||
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Put the total in $f16 and call the external sqrt through $27 (loaded in
   the prologue), then rebuild $29 from the return address in $26. */
addt a0, a2, $f16
|
||||
jsr $26, ($27), sqrt !lituse_jsr!2
|
||||
|
||||
ldah $29, 0($26) !gpdisp!3
|
||||
lda $29, 0($29) !gpdisp!3
|
||||
#else
|
||||
/* Other targets use the sqrtt instruction directly. */
addt a0, a2, a0
|
||||
sqrtt a0, a0
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
$L999:
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Restore the return address saved in the prologue and pop the frame. */
ldq $26, 0($sp)
|
||||
lda $sp, 16($sp)
|
||||
#endif
|
||||
ret
|
||||
EPILOGUE
|
||||
530
kernel/alpha/dot.S
Normal file
530
kernel/alpha/dot.S
Normal file
@@ -0,0 +1,530 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define PREFETCHSIZE 88
|
||||
|
||||
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define Y $19
|
||||
#define INCY $20
|
||||
|
||||
#define I $5
|
||||
|
||||
#define s0 $f0
|
||||
#define s1 $f30
|
||||
#define s2 $f1
|
||||
#define s3 $f2
|
||||
|
||||
#define a0 $f10
|
||||
#define a1 $f11
|
||||
#define a2 $f12
|
||||
#define a3 $f13
|
||||
#define a4 $f14
|
||||
#define a5 $f15
|
||||
#define a6 $f16
|
||||
#define a7 $f17
|
||||
|
||||
#define b0 $f18
|
||||
#define b1 $f19
|
||||
#define b2 $f20
|
||||
#define b3 $f21
|
||||
#define b4 $f22
|
||||
#define b5 $f23
|
||||
#define b6 $f24
|
||||
#define b7 $f25
|
||||
|
||||
#define t0 $f26
|
||||
#define t1 $f27
|
||||
#define t2 $f28
|
||||
#define t3 $f29
|
||||
|
||||
|
||||
/*
 * Entry of the dot kernel. Reserves a 16-byte frame, zeroes the four partial
 * sums s0-s3 and the four pending products t0-t3, then either falls through
 * to the unit-stride code or branches to the strided code at $L20.
 * PROLOGUE/PROFCODE are presumably macros supplied by common.h (not shown).
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 16, $26, 0
|
||||
|
||||
lda $sp, -16($sp)
|
||||
fclr s0
|
||||
/* s3 is $f2, so the incoming $f2 is saved here and reloaded at $L999. */
stt $f2, 0($sp)
|
||||
fclr s1
|
||||
|
||||
fclr s2
|
||||
nop
|
||||
fclr s3
|
||||
/* N <= 0: go straight to the exit with every accumulator still zero. */
ble N, $L999
|
||||
|
||||
fclr t0
|
||||
cmpeq INCX, 1, $21
|
||||
fclr t1
|
||||
cmpeq INCY, 1, $22
|
||||
fclr t2
|
||||
and $21, $22, $22
|
||||
fclr t3
|
||||
/* $22 is non-zero only when INCX == 1 and INCY == 1; otherwise use $L20. */
beq $22, $L20
|
||||
|
||||
/*
 * Unit-stride path when DOUBLE is not defined. The loop below is unrolled
 * over 16 * SIZE per pass; I = N >> 4 is the number of such passes and zero
 * passes means only the remainder at $L15 runs. Loads a0-a7 and b0-b5 ahead
 * of the loop (b6/b7 are loaded inside it) and advances X and Y by 16 * SIZE
 * so that later loads use offsets relative to the advanced pointers.
 */
#ifndef DOUBLE
|
||||
srl N, 4, I
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD b0, 0 * SIZE(Y)
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
LD a2, 2 * SIZE(X)
|
||||
LD a3, 3 * SIZE(X)
|
||||
LD b2, 2 * SIZE(Y)
|
||||
LD b3, 3 * SIZE(Y)
|
||||
|
||||
LD a4, 4 * SIZE(X)
|
||||
LD a5, 5 * SIZE(X)
|
||||
LD b4, 4 * SIZE(Y)
|
||||
LD b5, 5 * SIZE(Y)
|
||||
|
||||
LD a6, 6 * SIZE(X)
|
||||
LD a7, 7 * SIZE(X)
|
||||
addq X, 16 * SIZE, X
|
||||
subq I, 1, I
|
||||
|
||||
addq Y, 16 * SIZE, Y
|
||||
/* Only one pass requested: skip the steady-state loop and drain at $L13. */
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Steady-state loop for the non-DOUBLE unit-stride path; one pass issues 16
 * MULs. Each ADD folds the product left in t0-t3 by the MUL four groups
 * earlier into s0-s3 while the MUL next to it starts a new product, and the
 * LDs refill registers whose previous value has just been consumed.
 * X is advanced at the top of the pass and Y at the bottom, so X offsets are
 * relative to the already-advanced X and Y offsets to the not-yet-advanced Y.
 */
$L12:
|
||||
/* Loads into $31 discard their result; presumably used as prefetch hints. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
|
||||
subq I, 1, I
|
||||
ldl $31, PREFETCHSIZE * 2 * SIZE(Y)
|
||||
addq X, 16 * SIZE, X
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b6, -10 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -9 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a0, -24 * SIZE(X)
|
||||
MUL a1, b1, t1
|
||||
LD a1, -23 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b0, -8 * SIZE(Y)
|
||||
MUL a2, b2, t2
|
||||
LD b1, -7 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, -22 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, -21 * SIZE(X)
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b2, -6 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, -5 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a4, -20 * SIZE(X)
|
||||
MUL a5, b5, t1
|
||||
LD a5, -19 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b4, -4 * SIZE(Y)
|
||||
MUL a6, b6, t2
|
||||
LD b5, -3 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, -18 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, -17 * SIZE(X)
|
||||
|
||||
/* Second half of the pass: same pattern on the next eight operand pairs. */
ADD s0, t0, s0
|
||||
LD b6, -2 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a0, -16 * SIZE(X)
|
||||
MUL a1, b1, t1
|
||||
LD a1, -15 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b0, 0 * SIZE(Y)
|
||||
MUL a2, b2, t2
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, -14 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, -13 * SIZE(X)
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b2, 2 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, 3 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a4, -12 * SIZE(X)
|
||||
MUL a5, b5, t1
|
||||
LD a5, -11 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b4, 4 * SIZE(Y)
|
||||
MUL a6, b6, t2
|
||||
LD b5, 5 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, -10 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, -9 * SIZE(X)
|
||||
|
||||
addq Y, 16 * SIZE, Y
|
||||
bgt I, $L12
|
||||
nop
|
||||
fnop
|
||||
.align 4
|
||||
|
||||
/*
 * Drain of the non-DOUBLE unrolled loop: performs the last 16 multiplies
 * using operands already in registers plus the loads below. Neither X nor Y
 * is advanced here, so all offsets are relative to the pointers left by the
 * preload or by the last pass of $L12.
 */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD b6,-10 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -9 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a0, -8 * SIZE(X)
|
||||
MUL a1, b1, t1
|
||||
LD a1, -7 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b0, -8 * SIZE(Y)
|
||||
MUL a2, b2, t2
|
||||
LD b1, -7 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, -6 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, -5 * SIZE(X)
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b2, -6 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, -5 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a4, -4 * SIZE(X)
|
||||
MUL a5, b5, t1
|
||||
LD a5, -3 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b4, -4 * SIZE(Y)
|
||||
MUL a6, b6, t2
|
||||
LD b5, -3 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, -2 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, -1 * SIZE(X)
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b6, -2 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -1 * SIZE(Y)
|
||||
/* All operands are now in registers; the rest is arithmetic only. */
ADD s1, t1, s1
|
||||
MUL a1, b1, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a2, b2, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a3, b3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
MUL a4, b4, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a5, b5, t1
|
||||
ADD s2, t2, s2
|
||||
MUL a6, b6, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a7, b7, t3
|
||||
.align 4
|
||||
|
||||
/*
 * Remainder setup: fold the pending t0/t1 into s0/s1 (t2/t3 are folded later
 * at $L16/$L18) and compute I = N & 15; nothing left means jump to $L18.
 */
$L15:
|
||||
ADD s0, t0, s0
|
||||
and N, 15, I
|
||||
ADD s1, t1, s1
|
||||
ble I, $L18
|
||||
.align 4
|
||||
|
||||
/*
 * From the #else below: unit-stride path when DOUBLE is defined. Same scheme
 * as the other branch but unrolled over 8 * SIZE per pass (I = N >> 3).
 * Loads a0-a7 and b0-b5 ahead of the loop and advances X and Y by 8 * SIZE.
 */
#else
|
||||
|
||||
srl N, 3, I
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD b0, 0 * SIZE(Y)
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
LD a2, 2 * SIZE(X)
|
||||
LD a3, 3 * SIZE(X)
|
||||
LD b2, 2 * SIZE(Y)
|
||||
LD b3, 3 * SIZE(Y)
|
||||
|
||||
LD a4, 4 * SIZE(X)
|
||||
LD a5, 5 * SIZE(X)
|
||||
LD b4, 4 * SIZE(Y)
|
||||
LD b5, 5 * SIZE(Y)
|
||||
|
||||
LD a6, 6 * SIZE(X)
|
||||
LD a7, 7 * SIZE(X)
|
||||
addq X, 8 * SIZE, X
|
||||
subq I, 1, I
|
||||
|
||||
addq Y, 8 * SIZE, Y
|
||||
/* Only one pass requested: skip the steady-state loop and drain at $L13. */
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Steady-state loop for the DOUBLE unit-stride path; one pass issues 8 MULs.
 * Each ADD folds the product left in t0-t3 by the MUL four groups earlier
 * into s0-s3, the MUL beside it starts the next product, and the LDs refill
 * the operand registers for the following pass. X is advanced at the top and
 * Y at the bottom, so X offsets are negative and Y offsets run from -2 to 5.
 */
$L12:
|
||||
/* Loads into $31 discard their result; presumably used as prefetch hints. */
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||
subq I, 1, I
|
||||
ldl $31, PREFETCHSIZE * SIZE(Y)
|
||||
addq X, 8 * SIZE, X
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b6, -2 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a0, -8 * SIZE(X)
|
||||
MUL a1, b1, t1
|
||||
LD a1, -7 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b0, 0 * SIZE(Y)
|
||||
MUL a2, b2, t2
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, -6 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, -5 * SIZE(X)
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD b2, 2 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, 3 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a4, -4 * SIZE(X)
|
||||
MUL a5, b5, t1
|
||||
LD a5, -3 * SIZE(X)
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD b4, 4 * SIZE(Y)
|
||||
MUL a6, b6, t2
|
||||
LD b5, 5 * SIZE(Y)
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, -2 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, -1 * SIZE(X)
|
||||
|
||||
addq Y, 8 * SIZE, Y
|
||||
bgt I, $L12
|
||||
nop
|
||||
fnop
|
||||
.align 4
|
||||
|
||||
/*
 * Drain of the DOUBLE unrolled loop: loads the two Y values still missing
 * (b6/b7) and performs the last 8 multiplies on operands already in
 * registers. No pointer is advanced here.
 */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD b6, -2 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, -1 * SIZE(Y)
|
||||
ADD s1, t1, s1
|
||||
MUL a1, b1, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a2, b2, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a3, b3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
MUL a4, b4, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a5, b5, t1
|
||||
ADD s2, t2, s2
|
||||
MUL a6, b6, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a7, b7, t3
|
||||
.align 4
|
||||
|
||||
/*
 * Remainder setup: fold the pending t0/t1 into s0/s1 (t2/t3 are folded later
 * at $L16/$L18) and compute I = N & 7; nothing left means jump to $L18.
 */
$L15:
|
||||
ADD s0, t0, s0
|
||||
and N, 7, I
|
||||
ADD s1, t1, s1
|
||||
ble I, $L18
|
||||
.align 4
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Unit-stride remainder loop, one element per pass, shared by both unrolled
 * variants. The ADD folds the product left in t2 by the previous pass (or by
 * the unrolled code) into s2 before the MUL overwrites t2 with the new one.
 */
$L16:
|
||||
LD a0, 0 * SIZE(X)
|
||||
addq X, SIZE, X
|
||||
LD b0, 0 * SIZE(Y)
|
||||
addq Y, SIZE, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a0, b0, t2
|
||||
subq I, 1, I
|
||||
bgt I, $L16
|
||||
.align 4
|
||||
|
||||
/* Fold the last pending products in t2 and t3, then go to the final sum. */
$L18:
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
br $L999
|
||||
.align 4
|
||||
|
||||
/*
 * Strided path, taken when INCX or INCY is not 1. Works on groups of four
 * elements (I = N >> 2); with no full group only the remainder at $L25 runs.
 * Preloads four X/Y pairs, stepping each pointer with SXADDQ after every
 * load (SXADDQ comes from common.h; presumably it adds the increment scaled
 * by the element size - TODO confirm).
 */
$L20:
|
||||
srl N, 2, I
|
||||
ble I, $L25
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b0, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
LD a1, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b1, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b2, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b3, 0 * SIZE(Y)
|
||||
subq I, 1, I
|
||||
|
||||
SXADDQ INCY, Y, Y
|
||||
/* Only one group requested: skip the loop and drain at $L23. */
ble I, $L23
|
||||
.align 4
|
||||
|
||||
/*
 * Strided steady-state loop, four elements per pass: fold the previous four
 * products into s0-s3, start four new products from the operands already in
 * a0-a3/b0-b3, then reload those eight registers for the next pass.
 */
$L22:
|
||||
ADD s0, t0, s0
|
||||
MUL a0, b0, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a1, b1, t1
|
||||
ADD s2, t2, s2
|
||||
MUL a2, b2, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a3, b3, t3
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b0, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
LD a1, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b1, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b2, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b3, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
subq I, 1, I
|
||||
bgt I, $L22
|
||||
nop
|
||||
fnop
|
||||
.align 4
|
||||
|
||||
/*
 * Strided drain: multiply the last four preloaded pairs; the products stay
 * pending in t0-t3 and are folded in by the code that follows.
 */
$L23:
|
||||
ADD s0, t0, s0
|
||||
MUL a0, b0, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a1, b1, t1
|
||||
ADD s2, t2, s2
|
||||
MUL a2, b2, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a3, b3, t3
|
||||
.align 4
|
||||
|
||||
/* Fold pending t0/t1 and compute the remainder count I = N & 3. */
$L25:
|
||||
ADD s0, t0, s0
|
||||
and N, 3, I
|
||||
ADD s1, t1, s1
|
||||
ble I, $L28
|
||||
.align 4
|
||||
|
||||
/*
 * Strided remainder loop, one element per pass. The ADD folds the product
 * left in t2 by the previous pass before the MUL replaces it.
 */
$L26:
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD b0, 0 * SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a0, b0, t2
|
||||
subq I, 1, I
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
/* Fold the last pending products in t2 and t3; falls through to $L999. */
$L28:
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
.align 4
|
||||
|
||||
/*
 * Common exit: combine the four partial sums into s0 ($f0), reload the saved
 * $f2 and release the 16-byte frame. s3 is $f2, so it is read by the first
 * ADD before the ldt overwrites that register.
 */
$L999:
|
||||
ADD s2, s3, s2
|
||||
ldt $f2, 0($sp)
|
||||
ADD s0, s1, s0
|
||||
lda $sp, 16($sp)
|
||||
|
||||
ADD s0, s2, s0
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
179
kernel/alpha/gemm_beta.S
Normal file
179
kernel/alpha/gemm_beta.S
Normal file
@@ -0,0 +1,179 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Entry of the beta-scaling routine (symbol CNAME). As used below:
 *   $16  = m, the length of one column (see "i = (m >> 3)")
 *   $17  = column counter, decremented as j
 *   $f19 = beta
 *   c and ldc are fetched from the stack at 16($sp) and 24($sp).
 * Returns immediately through $End when $16 <= 0 or $17 <= 0, otherwise
 * dispatches on beta == 0.
 * NOTE(review): ldc is read with ldl (32-bit load) while c uses ldq - confirm
 * that a 32-bit ldc is intended.
 * NOTE(review): the early exits through $End do not execute clr $0, unlike
 * the normal exits - confirm callers ignore the return value.
 */
.set noat
|
||||
.set noreorder
|
||||
.text
|
||||
.align 5
|
||||
.globl CNAME
|
||||
.ent CNAME
|
||||
CNAME:
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
#ifdef PROFILE
|
||||
ldgp $gp, 0($27)
|
||||
lda $28, _mcount
|
||||
jsr $28, ($28), _mcount
|
||||
#endif
|
||||
|
||||
ldq $18, 16($sp)
|
||||
ble $16, $End
|
||||
ldl $19, 24($sp)
|
||||
ble $17, $End
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO)
|
||||
.align 4
|
||||
|
||||
/*
 * beta != 0: scale one column per outer pass. $1 walks the current column
 * starting at $18, $2 counts groups of eight elements, and $18 is stepped
 * to the next column at $L54.
 */
$BETA_NE_ZERO:
|
||||
sra $16, 3, $2 # i = (m >> 3)
|
||||
mov $18, $1 # c_offset = c
|
||||
lda $17, -1($17) # j --
|
||||
ble $2,$L52
|
||||
.align 4
|
||||
|
||||
/* Eight elements per pass: load, multiply each by beta ($f19), store back. */
$L51:
|
||||
/* Load into $f31 discards its result; presumably a prefetch hint. */
lds $f31, 64($1)
|
||||
lda $2, -1($2)
|
||||
|
||||
LD $f14, 0*SIZE($1)
|
||||
LD $f15, 1*SIZE($1)
|
||||
LD $f16, 2*SIZE($1)
|
||||
LD $f17, 3*SIZE($1)
|
||||
LD $f18, 4*SIZE($1)
|
||||
LD $f11, 5*SIZE($1)
|
||||
LD $f21, 6*SIZE($1)
|
||||
LD $f22, 7*SIZE($1)
|
||||
|
||||
MUL $f19, $f14, $f23
|
||||
MUL $f19, $f15, $f24
|
||||
MUL $f19, $f16, $f25
|
||||
MUL $f19, $f17, $f26
|
||||
MUL $f19, $f18, $f27
|
||||
MUL $f19, $f11, $f28
|
||||
MUL $f19, $f21, $f29
|
||||
MUL $f19, $f22, $f30
|
||||
|
||||
ST $f23, 0*SIZE($1)
|
||||
ST $f24, 1*SIZE($1)
|
||||
ST $f25, 2*SIZE($1)
|
||||
ST $f26, 3*SIZE($1)
|
||||
ST $f27, 4*SIZE($1)
|
||||
ST $f28, 5*SIZE($1)
|
||||
ST $f29, 6*SIZE($1)
|
||||
ST $f30, 7*SIZE($1)
|
||||
|
||||
lda $1,8*SIZE($1)
|
||||
bgt $2,$L51
|
||||
.align 4
|
||||
|
||||
/* Remaining ($16 & 7) elements of the column, one per pass. */
$L52:
|
||||
and $16, 7, $2
|
||||
ble $2,$L54
|
||||
.align 4
|
||||
|
||||
$L53:
|
||||
LD $f12, 0($1)
|
||||
lda $2, -1($2)
|
||||
MUL $f19, $f12, $f23
|
||||
ST $f23, 0($1)
|
||||
lda $1, SIZE($1)
|
||||
bgt $2,$L53
|
||||
.align 4
|
||||
|
||||
/* Next column; when the column counter is exhausted return with $0 = 0. */
$L54:
|
||||
SXADDQ $19, $18, $18 # c += ldc
|
||||
bgt $17,$BETA_NE_ZERO
|
||||
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/*
 * beta == 0: store $f31 into every element of each column instead of
 * multiplying (so the old contents of c are never read). Same loop
 * structure as the beta != 0 case.
 * NOTE(review): $4 is initialised and advanced alongside $1 but is never
 * read in this routine - looks like a leftover; confirm before removing.
 */
$BETA_EQ_ZERO:
|
||||
sra $16, 3, $2 # i = (m >> 3)
|
||||
lda $4, 8*SIZE($18)
|
||||
mov $18, $1 # c_offset = c
|
||||
lda $17, -1($17) # j --
|
||||
ble $2,$L42
|
||||
.align 4
|
||||
|
||||
/* Eight stores per pass. */
$L41:
|
||||
ST $f31, 0*SIZE($1)
|
||||
ST $f31, 1*SIZE($1)
|
||||
ST $f31, 2*SIZE($1)
|
||||
ST $f31, 3*SIZE($1)
|
||||
ST $f31, 4*SIZE($1)
|
||||
ST $f31, 5*SIZE($1)
|
||||
ST $f31, 6*SIZE($1)
|
||||
ST $f31, 7*SIZE($1)
|
||||
lda $2, -1($2)
|
||||
|
||||
lda $4, 8*SIZE($4)
|
||||
lda $1, 8*SIZE($1)
|
||||
bgt $2,$L41
|
||||
.align 4
|
||||
|
||||
/* Remaining ($16 & 7) elements of the column, one store per pass. */
$L42:
|
||||
and $16, 7, $2
|
||||
ble $2,$L44
|
||||
.align 4
|
||||
|
||||
$L43:
|
||||
lda $2, -1($2)
|
||||
ST $f31, 0($1)
|
||||
lda $1, SIZE($1)
|
||||
bgt $2, $L43
|
||||
.align 4
|
||||
|
||||
/* Next column; when done set $0 = 0 and fall through to the shared ret. */
$L44:
|
||||
SXADDQ $19, $18, $18 # c += ldc
|
||||
bgt $17,$BETA_EQ_ZERO
|
||||
clr $0
|
||||
.align 4
|
||||
|
||||
/* Shared return point, also the target of the early exits at entry. */
$End:
|
||||
ret
|
||||
.ident VERSION
|
||||
.end CNAME
|
||||
2852
kernel/alpha/gemm_kernel_4x4.S
Normal file
2852
kernel/alpha/gemm_kernel_4x4.S
Normal file
File diff suppressed because it is too large
Load Diff
1307
kernel/alpha/gemv_n.S
Normal file
1307
kernel/alpha/gemv_n.S
Normal file
File diff suppressed because it is too large
Load Diff
1061
kernel/alpha/gemv_t.S
Normal file
1061
kernel/alpha/gemv_t.S
Normal file
File diff suppressed because it is too large
Load Diff
440
kernel/alpha/iamax.S
Normal file
440
kernel/alpha/iamax.S
Normal file
@@ -0,0 +1,440 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define XX $19
|
||||
|
||||
#ifndef USE_MIN
|
||||
#define CMPLT(a, b) cmptlt a, b
|
||||
#else
|
||||
#define CMPLT(a, b) cmptlt b, a
|
||||
#endif
|
||||
|
||||
#define STACKSIZE 6 * 8
|
||||
|
||||
/*
 * Entry of the index-of-extreme-absolute-value kernel. The routine makes two
 * passes over the vector: the first reduces the absolute values into $f0
 * using CMPLT (defined above; operands are swapped under USE_MIN), the
 * second, starting at $L20, rescans from XX and counts in $0 until an
 * element's absolute value equals $f0.
 * This prologue saves $f2-$f6 in a STACKSIZE frame, clears the comparison
 * flags $f16-$f19, the result $0 and $f0, and returns 0 via $End when
 * n <= 0 or incx <= 0.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, STACKSIZE, $26, 0
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
ldl N, 0(N) # n
|
||||
ldl INCX, 0(INCX) # incx
|
||||
#endif
|
||||
lda $sp, -STACKSIZE($sp)
|
||||
/* XX keeps the original start of the vector for the second pass. */
mov X, XX
|
||||
.align 4
|
||||
|
||||
stt $f2, 0($sp)
|
||||
fclr $f16
|
||||
cmplt $31, N, $2
|
||||
unop
|
||||
|
||||
stt $f3, 8($sp)
|
||||
fclr $f17
|
||||
cmplt $31, INCX, $3
|
||||
unop
|
||||
|
||||
stt $f4, 16($sp)
|
||||
fclr $f18
|
||||
/* Rewrites INCX via SXADDQ (from common.h); presumably converts it to a
   byte stride, since it is added to X with plain addq below - TODO confirm. */
SXADDQ INCX, $31, INCX
|
||||
unop
|
||||
|
||||
stt $f5, 24($sp)
|
||||
fclr $f19
|
||||
and $2, $3, $2
|
||||
clr $0
|
||||
|
||||
stt $f6, 32($sp)
|
||||
fclr $f0
|
||||
/* $1 = number of 8-element groups for the unrolled first pass. */
sra N, 3, $1
|
||||
beq $2, $End # if (n <= 0) or (incx <= 0) return
|
||||
.align 4
|
||||
|
||||
/* Seed the running extreme with |x[0]|; with no full group of 8 go to the
   scalar loop at $L15 (X has not been advanced, so x[0] is visited again). */
LD $f20, 0 * SIZE(X)
|
||||
unop
|
||||
fabs $f20, $f0
|
||||
ble $1, $L15
|
||||
.align 4
|
||||
|
||||
/*
 * Set-up for the unrolled first pass. Eight running extremes are kept in
 * $f0-$f6 and $f28; all of them are seeded with |x[0]| (still in $f20).
 * In between, the next seven elements are loaded into $f21-$f27 and X is
 * advanced past the first eight elements. $1 is decremented once; if no
 * further group remains the loop is skipped and $L13 finishes the reduction.
 */
fabs $f20, $f1
|
||||
unop
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f21, 0 * SIZE(X)
|
||||
fabs $f20, $f2
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fabs $f20, $f3
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f23, 0 * SIZE(X)
|
||||
fabs $f20, $f4
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f24, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
fabs $f20, $f5
|
||||
unop
|
||||
|
||||
LD $f25, 0 * SIZE(X)
|
||||
fabs $f20, $f6
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f26, 0 * SIZE(X)
|
||||
fabs $f20, $f28
|
||||
addq X, INCX, X
|
||||
lda $1, -1($1)
|
||||
|
||||
LD $f27, 0 * SIZE(X)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
ble $1, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Unrolled first-pass loop, eight elements per pass. For each of the eight
 * lanes: fabs takes the absolute value of the element loaded earlier, CMPLT
 * compares the lane's running extreme with it and leaves a flag in
 * $f16-$f19, and fcmovne replaces the running extreme when the flag is set.
 * The four flag registers are reused for lanes 0-3 and 4-7, so the fcmovne
 * instructions for lanes 4-7 are deferred to the top of the next pass (or to
 * $L13). On the first pass those flags are still zero from the prologue, so
 * the leading fcmovne instructions move nothing. Each LD refills a register
 * only after its previous value has been consumed by fabs.
 */
$L12:
|
||||
fcmovne $f16, $f12, $f4
|
||||
unop
|
||||
fabs $f20, $f29
|
||||
/* Load into $31 discards its result; presumably a prefetch hint. */
ldl $31, 56 * SIZE(X)
|
||||
|
||||
fcmovne $f17, $f13, $f5
|
||||
LD $f20, 0 * SIZE(X)
|
||||
fabs $f21, $f30
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
LD $f21, 0 * SIZE(X)
|
||||
fabs $f22, $f10
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f19, $f15, $f28
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fabs $f23, $f11
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f24, $f12
|
||||
LD $f23, 0 * SIZE(X)
|
||||
CMPLT($f0, $f29), $f16
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f25, $f13
|
||||
LD $f24, 0 * SIZE(X)
|
||||
CMPLT($f1, $f30), $f17
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f26, $f14
|
||||
LD $f25, 0 * SIZE(X)
|
||||
CMPLT($f2, $f10), $f18
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f27, $f15
|
||||
LD $f26, 0 * SIZE(X)
|
||||
CMPLT($f3, $f11), $f19
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f16, $f29, $f0
|
||||
LD $f27, 0 * SIZE(X)
|
||||
CMPLT($f4, $f12), $f16
|
||||
addq X, INCX, X
|
||||
|
||||
fcmovne $f17, $f30, $f1
|
||||
unop
|
||||
CMPLT($f5, $f13), $f17
|
||||
lda $1, -1($1) # i --
|
||||
|
||||
fcmovne $f18, $f10, $f2
|
||||
unop
|
||||
CMPLT($f6, $f14), $f18
|
||||
unop
|
||||
|
||||
fcmovne $f19, $f11, $f3
|
||||
unop
|
||||
CMPLT($f28, $f15), $f19
|
||||
bgt $1,$L12
|
||||
.align 4
|
||||
|
||||
/*
 * Drain of the unrolled first pass: applies the deferred updates for lanes
 * 4-7, processes the last eight loaded elements ($f20-$f27) with the same
 * fabs / CMPLT / fcmovne pattern and no further loads, then reduces the
 * eight running extremes pairwise (8 -> 4 -> 2 -> 1) so that the overall
 * result ends up in $f0.
 */
$L13:
|
||||
fcmovne $f16, $f12, $f4
|
||||
fabs $f20, $f29
|
||||
fcmovne $f17, $f13, $f5
|
||||
fabs $f21, $f30
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
fabs $f22, $f10
|
||||
fcmovne $f19, $f15, $f28
|
||||
fabs $f23, $f11
|
||||
|
||||
fabs $f24, $f12
|
||||
CMPLT($f0, $f29), $f16
|
||||
fabs $f25, $f13
|
||||
CMPLT($f1, $f30), $f17
|
||||
|
||||
fabs $f26, $f14
|
||||
CMPLT($f2, $f10), $f18
|
||||
fabs $f27, $f15
|
||||
CMPLT($f3, $f11), $f19
|
||||
|
||||
fcmovne $f16, $f29, $f0
|
||||
CMPLT($f4, $f12), $f16
|
||||
fcmovne $f17, $f30, $f1
|
||||
CMPLT($f5, $f13), $f17
|
||||
|
||||
fcmovne $f18, $f10, $f2
|
||||
CMPLT($f6, $f14), $f18
|
||||
fcmovne $f19, $f11, $f3
|
||||
CMPLT($f28, $f15), $f19
|
||||
|
||||
/* Finish lanes 4-7 while starting the pairwise reduction of lanes 0-3. */
fcmovne $f16, $f12, $f4
|
||||
CMPLT($f0, $f1), $f16
|
||||
fcmovne $f17, $f13, $f5
|
||||
CMPLT($f2, $f3), $f17
|
||||
|
||||
fcmovne $f18, $f14, $f6
|
||||
CMPLT($f4, $f5), $f18
|
||||
fcmovne $f19, $f15, $f28
|
||||
CMPLT($f6, $f28), $f19
|
||||
|
||||
/* 8 -> 4: results in $f0, $f2, $f4, $f6. */
fcmovne $f16, $f1, $f0
|
||||
fcmovne $f17, $f3, $f2
|
||||
fcmovne $f18, $f5, $f4
|
||||
fcmovne $f19, $f28, $f6
|
||||
|
||||
/* 4 -> 2: results in $f0 and $f4. */
CMPLT($f0, $f2), $f16
|
||||
CMPLT($f4, $f6), $f17
|
||||
|
||||
fcmovne $f16, $f2, $f0
|
||||
fcmovne $f17, $f6, $f4
|
||||
|
||||
/* 2 -> 1: final result in $f0. */
CMPLT($f0, $f4), $f16
|
||||
fcmovne $f16, $f4, $f0
|
||||
.align 4
|
||||
|
||||
/*
 * Scalar tail of the first pass: handles the N & 7 elements that the
 * unrolled code did not cover; with none left go to the second pass at $L20.
 */
$L15:
|
||||
and N, 7, $1
|
||||
unop
|
||||
unop
|
||||
ble $1, $L20
|
||||
.align 4
|
||||
|
||||
/* One element per pass: update $f0 when CMPLT($f0, |x|) sets the flag. */
$L16:
|
||||
LD $f20, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f20, $f29
|
||||
CMPLT($f0, $f29), $f16
|
||||
fcmovne $f16, $f29, $f0
|
||||
|
||||
lda $1, -1($1) # i --
|
||||
bgt $1, $L16
|
||||
.align 4
|
||||
|
||||
/*
 * Start of the second pass: $f0 now holds the value to look for and XX still
 * points at the start of the vector. With fewer than eight elements the
 * scalar search at $L40 is used directly. Otherwise the first eight elements
 * are loaded into $f10-$f17 and the absolute values of the first four are
 * prepared in $f18-$f21 for the unrolled search.
 */
$L20:
|
||||
sra N, 3, $1
|
||||
ble $1, $L40
|
||||
.align 4
|
||||
|
||||
LD $f10, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
LD $f11, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
|
||||
LD $f12, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
LD $f13, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
|
||||
LD $f14, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
LD $f15, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
|
||||
LD $f16, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
LD $f17, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
|
||||
fabs $f10, $f18
|
||||
fabs $f11, $f19
|
||||
fabs $f12, $f20
|
||||
fabs $f13, $f21
|
||||
|
||||
lda $1, -1($1)
|
||||
/* Only one group of eight: check it in the drain at $L23. */
ble $1, $L23
|
||||
.align 4
|
||||
|
||||
/*
 * Unrolled search loop, eight elements per pass. For every element the
 * absolute value is compared with $f0 by cmpteq; $0 is incremented once per
 * element and the first element that matches branches to $End with its
 * 1-based position in $0. The loads for the next group of eight are
 * interleaved with the checks of the current group.
 * $f2-$f5 are used as flag registers here; they were saved in the prologue.
 */
$L22:
|
||||
LD $f10, 0 * SIZE(XX)
|
||||
fabs $f14, $f22
|
||||
addq XX, INCX, XX
|
||||
cmpteq $f0, $f18, $f2
|
||||
|
||||
LD $f11, 0 * SIZE(XX)
|
||||
fabs $f15, $f23
|
||||
addq XX, INCX, XX
|
||||
cmpteq $f0, $f19, $f3
|
||||
|
||||
LD $f12, 0 * SIZE(XX)
|
||||
fabs $f16, $f24
|
||||
addq XX, INCX, XX
|
||||
cmpteq $f0, $f20, $f4
|
||||
|
||||
LD $f13, 0 * SIZE(XX)
|
||||
fabs $f17, $f25
|
||||
addq XX, INCX, XX
|
||||
cmpteq $f0, $f21, $f5
|
||||
|
||||
LD $f14, 0 * SIZE(XX)
|
||||
lda $1, -1($1) # i --
|
||||
cmpteq $f0, $f22, $f26
|
||||
addq XX, INCX, XX
|
||||
|
||||
/* From here on: bump the index, then leave if that element matched. */
lda $0, 1($0)
|
||||
fbne $f2, $End
|
||||
|
||||
LD $f15, 0 * SIZE(XX)
|
||||
cmpteq $f0, $f23, $f27
|
||||
lda $0, 1($0)
|
||||
fbne $f3, $End
|
||||
|
||||
addq XX, INCX, XX
|
||||
cmpteq $f0, $f24, $f28
|
||||
lda $0, 1($0)
|
||||
fbne $f4, $End
|
||||
|
||||
LD $f16, 0 * SIZE(XX)
|
||||
cmpteq $f0, $f25, $f29
|
||||
lda $0, 1($0)
|
||||
fbne $f5, $End
|
||||
|
||||
addq XX, INCX, XX
|
||||
lda $0, 1($0)
|
||||
fabs $f10, $f18
|
||||
fbne $f26, $End
|
||||
|
||||
LD $f17, 0 * SIZE(XX)
|
||||
lda $0, 1($0)
|
||||
fabs $f11, $f19
|
||||
fbne $f27, $End
|
||||
|
||||
addq XX, INCX, XX
|
||||
lda $0, 1($0)
|
||||
fabs $f12, $f20
|
||||
fbne $f28, $End
|
||||
|
||||
lda $0, 1($0)
|
||||
fabs $f13, $f21
|
||||
fbne $f29, $End
|
||||
bgt $1, $L22
|
||||
.align 4
|
||||
|
||||
/*
 * Drain of the unrolled search: checks the last eight loaded elements
 * ($f10-$f17) against $f0 without loading anything further. $0 is bumped
 * before each test, and the first match exits through $End. If none of them
 * matches, execution falls through to the scalar search at $L40.
 */
$L23:
|
||||
fabs $f14, $f22
|
||||
cmpteq $f0, $f18, $f2
|
||||
fabs $f15, $f23
|
||||
cmpteq $f0, $f19, $f3
|
||||
|
||||
fabs $f16, $f24
|
||||
cmpteq $f0, $f20, $f4
|
||||
fabs $f17, $f25
|
||||
cmpteq $f0, $f21, $f5
|
||||
|
||||
cmpteq $f0, $f22, $f26
|
||||
lda $0, 1($0)
|
||||
unop
|
||||
fbne $f2, $End
|
||||
|
||||
cmpteq $f0, $f23, $f27
|
||||
lda $0, 1($0)
|
||||
unop
|
||||
fbne $f3, $End
|
||||
|
||||
cmpteq $f0, $f24, $f28
|
||||
lda $0, 1($0)
|
||||
unop
|
||||
fbne $f4, $End
|
||||
|
||||
cmpteq $f0, $f25, $f29
|
||||
lda $0, 1($0)
|
||||
unop
|
||||
fbne $f5, $End
|
||||
|
||||
lda $0, 1($0)
|
||||
fbne $f26, $End
|
||||
lda $0, 1($0)
|
||||
fbne $f27, $End
|
||||
lda $0, 1($0)
|
||||
fbne $f28, $End
|
||||
lda $0, 1($0)
|
||||
fbne $f29, $End
|
||||
.align 4
|
||||
|
||||
/*
 * Scalar search: one element per pass, bumping $0 and leaving through $End
 * as soon as an element's absolute value equals $f0.
 * NOTE(review): this loop has no trip counter; it only terminates when some
 * element compares equal to $f0. TODO confirm what happens when none does
 * (for example if the comparison can never be true for the value in $f0).
 */
$L40:
|
||||
LD $f20, 0 * SIZE(XX)
|
||||
addq XX, INCX, XX
|
||||
|
||||
fabs $f20, $f25
|
||||
cmpteq $f0, $f25, $f29
|
||||
|
||||
lda $0, 1($0)
|
||||
fbne $f29, $End
|
||||
br $31, $L40
|
||||
.align 4
|
||||
|
||||
/*
 * Common exit: reload $f2-$f6 saved by the prologue, release the frame and
 * return with the result in $0 (0 when the prologue rejected the input).
 */
$End:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
|
||||
ldt $f6, 32($sp)
|
||||
lda $sp, STACKSIZE($sp)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
351
kernel/alpha/imax.S
Normal file
351
kernel/alpha/imax.S
Normal file
@@ -0,0 +1,351 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Index-of-extremum kernel for a strided vector of real elements.
 *
 * Pass 1 walks the vector with X and leaves the extreme element in $f0:
 * the largest one by default, the smallest one when USE_MIN is defined
 * (CMPLT then swaps its two operands).  Pass 2 walks the vector again
 * with XX and returns, in $0, the 1-based position of the first element
 * that compares equal to $f0.  $0 is 0 when n <= 0 or incx <= 0.
 *
 * Register roles:
 *   N    ($16)  element count
 *   X    ($17)  read cursor for pass 1
 *   INCX ($18)  stride, rewritten by SXADDQ before it is added to the
 *               cursors (presumably scaled to bytes -- confirm in
 *               common.h)
 *   XX   ($19)  copy of the incoming X, read cursor for pass 2
 *   $1          loop counter, $2/$3 argument-check temporaries
 *   $f0,$f1,$f10-$f15   eight running extrema (pass 1)
 *   $f20-$f27           loaded elements (pass 1) / compare flags (pass 2)
 *   $f16-$f19           compare flags (pass 1)
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LD, SIZE and SXADDQ are macros that are
 * not defined in this block.  STACKSIZE is defined here but is not
 * referenced by any instruction below.
 *
 * The instruction order is hand scheduled (loads of the next group are
 * interleaved with compares of the current one); do not reorder.
 */
#define N	$16
#define X	$17
#define INCX	$18
#define XX	$19

#ifndef USE_MIN
#define CMPLT(a, b) cmptlt a, b
#else
#define CMPLT(a, b) cmptlt b, a
#endif

#define STACKSIZE 8 * 8

	PROLOGUE
	PROFCODE

	/* result = 0; remember the start of the vector for pass 2 */
	clr	$0
	mov	X, XX
	.align 4

	/* $2 = (n > 0) && (incx > 0) */
	cmplt	$31, N,    $2
	cmplt	$31, INCX, $3
	SXADDQ	INCX, $31, INCX
	and	$2,  $3,  $2

	/* $1 = number of groups of eight elements */
	sra	N, 3, $1
	fclr	$f0
	unop
	beq	$2,  $End		# if (n <= 0) or (incx <= 0) return
	.align 4

	/* seed the running extremum with x[0] */
	LD	$f0,  0 * SIZE(X)
	unop
	unop
	ble	$1,  $L15
	.align 4

	/*
	 * Pipeline fill: copy the seed into all eight accumulators, keep
	 * x[0] in $f20 and load x[1..7] into $f21-$f27.
	 */
	fmov	$f0,  $f1
	addq	X, INCX, X
	fmov	$f0,  $f10
	lda	$1,  -1($1)

	LD	$f21,  0 * SIZE(X)
	fmov	$f0,  $f11
	addq	X, INCX, X
	fmov	$f0,  $f12

	LD	$f22,  0 * SIZE(X)
	fmov	$f0,  $f13
	addq	X, INCX, X
	fmov	$f0,  $f14

	LD	$f23,  0 * SIZE(X)
	fmov	$f0,  $f15
	addq	X, INCX, X
	fmov	$f0,  $f20

	LD	$f24,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f25,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f26,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f27,  0 * SIZE(X)
	addq	X, INCX, X

	CMPLT($f0,  $f20), $f16
	CMPLT($f1,  $f21), $f17
	CMPLT($f10, $f22), $f18
	CMPLT($f11, $f23), $f19

	ble	$1, $L13
	.align 4

	/*
	 * Main pass-1 loop, eight elements per trip.  Each fcmovne commits
	 * a compare issued earlier, then the same register is reloaded with
	 * the matching element of the next group.
	 */
$L12:
	fcmovne	$f16, $f20, $f0
	LD	$f20,  0 * SIZE(X)
	CMPLT($f12, $f24), $f16
	addq	X, INCX, X

	fcmovne	$f17, $f21, $f1
	LD	$f21,  0 * SIZE(X)
	CMPLT($f13, $f25), $f17
	addq	X, INCX, X

	fcmovne	$f18, $f22, $f10
	LD	$f22,  0 * SIZE(X)
	CMPLT($f14, $f26), $f18
	addq	X, INCX, X

	fcmovne	$f19, $f23, $f11
	LD	$f23,  0 * SIZE(X)
	CMPLT($f15, $f27), $f19
	addq	X, INCX, X

	fcmovne	$f16, $f24, $f12
	LD	$f24,  0 * SIZE(X)
	CMPLT($f0,  $f20), $f16
	addq	X, INCX, X

	fcmovne	$f17, $f25, $f13
	LD	$f25,  0 * SIZE(X)
	CMPLT($f1,  $f21), $f17
	addq	X, INCX, X

	fcmovne	$f18, $f26, $f14
	LD	$f26,  0 * SIZE(X)
	CMPLT($f10, $f22), $f18
	addq	X, INCX, X

	fcmovne	$f19, $f27, $f15
	LD	$f27,  0 * SIZE(X)
	CMPLT($f11, $f23), $f19
	lda	$1,   -1($1)		# i --

	addq	X, INCX, X
	unop
	unop
	bgt	$1,$L12
	.align 4

	/*
	 * Pipeline drain: commit the last group, then reduce the eight
	 * accumulators pairwise (8 -> 4 -> 2 -> 1) into $f0.
	 */
$L13:
	fcmovne	$f16, $f20, $f0
	CMPLT($f12, $f24), $f16

	fcmovne	$f17, $f21, $f1
	CMPLT($f13, $f25), $f17

	fcmovne	$f18, $f22, $f10
	CMPLT($f14, $f26), $f18

	fcmovne	$f19, $f23, $f11
	CMPLT($f15, $f27), $f19

	fcmovne	$f16, $f24, $f12
	CMPLT($f0,  $f1), $f16
	fcmovne	$f17, $f25, $f13
	CMPLT($f10, $f11), $f17

	fcmovne	$f18, $f26, $f14
	CMPLT($f12, $f13), $f18
	fcmovne	$f19, $f27, $f15
	CMPLT($f14, $f15), $f19

	fcmovne	$f16, $f1, $f0
	fcmovne	$f17, $f11, $f10
	fcmovne	$f18, $f13, $f12
	fcmovne	$f19, $f15, $f14

	CMPLT($f0,  $f10), $f16
	CMPLT($f12, $f14), $f17

	fcmovne	$f16, $f10, $f0
	fcmovne	$f17, $f14, $f12

	CMPLT($f0,  $f12), $f16
	fcmovne	$f16, $f12, $f0
	.align 4

	/*
	 * Pass-1 tail: the n & 7 leftover elements, one per trip.  When no
	 * full group was processed X still points at x[0], which is then
	 * simply compared against itself once more.
	 */
$L15:
	and	N, 7, $1
	unop
	unop
	ble	$1,  $L20
	.align 4

$L16:
	LD	$f20,  0 * SIZE(X)
	addq	X, INCX, X

	CMPLT($f0,  $f20), $f16
	fcmovne	$f16, $f20, $f0
	lda	$1,   -1($1)		# i --
	bgt	$1, $L16
	.align 4

	/*
	 * Pass 2: search from XX for the first element equal to $f0.
	 * $0 is incremented before every test, so it is the 1-based index
	 * of the element being tested when the branch to $End is taken.
	 */
$L20:
	sra	N, 3, $1
	ble	$1,  $L40
	.align 4

	/* pipeline fill: first group of eight, compares for its first four */
	LD	$f10,  0 * SIZE(XX)
	addq	XX, INCX, XX
	LD	$f11,  0 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f12,  0 * SIZE(XX)
	addq	XX, INCX, XX
	LD	$f13,  0 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f14,  0 * SIZE(XX)
	addq	XX, INCX, XX
	LD	$f15,  0 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f16,  0 * SIZE(XX)
	addq	XX, INCX, XX
	LD	$f17,  0 * SIZE(XX)
	addq	XX, INCX, XX

	cmpteq	$f0, $f10, $f20
	cmpteq	$f0, $f11, $f21
	cmpteq	$f0, $f12, $f22
	cmpteq	$f0, $f13, $f23

	lda	$1,  -1($1)
	ble	$1, $L23
	.align 4

	/* main search loop: test the current group while loading the next */
$L22:
	LD	$f10,  0 * SIZE(XX)
	cmpteq	$f0, $f14, $f24
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f20,  $End

	LD	$f11,  0 * SIZE(XX)
	cmpteq	$f0, $f15, $f25
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f21,  $End

	LD	$f12,  0 * SIZE(XX)
	cmpteq	$f0, $f16, $f26
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f22,  $End

	LD	$f13,  0 * SIZE(XX)
	cmpteq	$f0, $f17, $f27
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f23,  $End

	LD	$f14,  0 * SIZE(XX)
	cmpteq	$f0, $f10, $f20
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f24,  $End

	LD	$f15,  0 * SIZE(XX)
	cmpteq	$f0, $f11, $f21
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f25,  $End

	LD	$f16,  0 * SIZE(XX)
	lda	$1,   -1($1)		# i --
	cmpteq	$f0, $f12, $f22
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f26,  $End

	LD	$f17,  0 * SIZE(XX)
	cmpteq	$f0, $f13, $f23
	lda	$0,    1($0)
	addq	XX, INCX, XX
	fbne	$f27,  $End

	bgt	$1,  $L22
	.align 4

	/* pipeline drain: test the last full group, no further loads */
$L23:
	lda	$0,    1($0)
	cmpteq	$f0, $f14, $f24
	unop
	fbne	$f20,  $End

	lda	$0,    1($0)
	cmpteq	$f0, $f15, $f25
	unop
	fbne	$f21,  $End

	lda	$0,    1($0)
	cmpteq	$f0, $f16, $f26
	unop
	fbne	$f22,  $End

	lda	$0,    1($0)
	cmpteq	$f0, $f17, $f27
	unop
	fbne	$f23,  $End

	lda	$0,    1($0)
	fbne	$f24,  $End
	lda	$0,    1($0)
	fbne	$f25,  $End
	lda	$0,    1($0)
	fbne	$f26,  $End
	lda	$0,    1($0)
	fbne	$f27,  $End
	.align 4

	/*
	 * Search tail, one element per trip.  The loop carries no counter:
	 * it ends only when an element compares equal to $f0.
	 * NOTE(review): if nothing ever compares equal (presumably only
	 * possible when $f0 is a NaN) this keeps reading past the vector --
	 * confirm callers exclude that input.
	 */
$L40:
	LD	$f20,  0 * SIZE(XX)
	addq	XX, INCX, XX

	cmpteq	$f0, $f20, $f29

	lda	$0,    1($0)
	fbne	$f29,  $End
	br	$31, $L40
	.align 4

$End:
	ret

	EPILOGUE
|
||||
427
kernel/alpha/izamax.S
Normal file
427
kernel/alpha/izamax.S
Normal file
@@ -0,0 +1,427 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Index-of-extremum kernel for a strided vector of complex elements,
 * using the magnitude |re| + |im| (fabs on both parts, then addt).
 *
 * Pass 1 walks the vector with X and leaves the extreme magnitude in
 * $f0: the largest by default, the smallest when USE_MIN is defined
 * (CMPLT then swaps its two operands).  Pass 2 walks the vector again
 * with XX, recomputes each magnitude the same way and returns, in $0,
 * the 1-based position of the first element whose magnitude compares
 * equal to $f0.  $0 is 0 when n <= 0 or incx <= 0.
 *
 * Register roles:
 *   N    ($16)  element count
 *   X    ($17)  read cursor for pass 1
 *   INCX ($18)  stride, rewritten by SXADDQ and then doubled
 *               (addq INCX, INCX, INCX) because each element holds two
 *               values; SXADDQ presumably scales to bytes -- confirm in
 *               common.h
 *   XX   ($19)  copy of the incoming X, read cursor for pass 2
 *   $1          loop counter, $2/$3 argument-check temporaries
 *   $f0-$f3     four running extrema (pass 1)
 *   $f2-$f9     are written by this routine, so they are spilled to a
 *               STACKSIZE-byte frame on entry and reloaded at $End
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LD, SIZE and SXADDQ are macros that are
 * not defined in this block.
 *
 * The instruction order is hand scheduled (loads, fabs and compares of
 * neighbouring groups are interleaved); do not reorder.
 */
#define N	$16
#define X	$17
#define INCX	$18
#define XX	$19

#ifndef USE_MIN
#define CMPLT(a, b) cmptlt a, b
#else
#define CMPLT(a, b) cmptlt b, a
#endif

#define STACKSIZE 8 * 8

	PROLOGUE
	PROFCODE

	/* open the frame and spill $f2-$f9, interleaved with argument checks */
	lda	$sp, -STACKSIZE($sp)

	stt	$f2,   0($sp)
	fclr	$f16
	cmplt	$31, N,    $2
	unop

	stt	$f3,   8($sp)
	fclr	$f17
	cmplt	$31, INCX, $3
	unop

	stt	$f4,  16($sp)
	fclr	$f18
	SXADDQ	INCX, $31, INCX
	unop

	stt	$f5,  24($sp)
	fclr	$f19
	and	$2,  $3,  $2
	clr	$0

	stt	$f6,  32($sp)
	mov	X, XX

	stt	$f7,  40($sp)
	stt	$f8,  48($sp)
	stt	$f9,  56($sp)

	fclr	$f0
	beq	$2,  $End		# if (n <= 0) or (incx <= 0) return
	.align 4

	/* seed $f0 with |re(x[0])| + |im(x[0])|; $1 = groups of four */
	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	sra	N, 2, $1
	addq	INCX, INCX, INCX

	fabs	$f20, $f20
	fabs	$f21, $f21
	addt	$f20, $f21, $f0
	ble	$1,  $L15
	.align 4

	/*
	 * Pipeline fill, stage 1: copy the seed into the four accumulators,
	 * load elements 1..3 and take fabs of the first group into
	 * $f8-$f15.
	 */
	lda	$1,  -1($1)
	unop
	addq	X, INCX, X
	unop

	LD	$f22,  0 * SIZE(X)
	fmov	$f0,  $f1
	LD	$f23,  1 * SIZE(X)
	addq	X, INCX, X

	LD	$f24,  0 * SIZE(X)
	fmov	$f0,  $f2
	LD	$f25,  1 * SIZE(X)
	addq	X, INCX, X

	LD	$f26,  0 * SIZE(X)
	fmov	$f0,  $f3
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X

	fabs	$f20, $f8
	fabs	$f21, $f9
	fabs	$f22, $f10
	fabs	$f23, $f11

	fabs	$f24, $f12
	fabs	$f25, $f13
	fabs	$f26, $f14
	fabs	$f27, $f15

	ble	$1, $L14
	.align 4

	/* pipeline fill, stage 2: load the second group into $f20-$f27 */
	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	lda	$1,  -1($1)
	addq	X, INCX, X

	LD	$f22,  0 * SIZE(X)
	LD	$f23,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	LD	$f24,  0 * SIZE(X)
	LD	$f25,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	LD	$f26,  0 * SIZE(X)
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X
	ble	$1, $L13
	.align 4

	/*
	 * Main pass-1 loop, four elements per trip: sum and compare the
	 * group held in $f8-$f15, take fabs of the group held in $f20-$f27
	 * and reload $f20-$f27 with the following group.
	 */
$L12:
	addt	$f8,  $f9,  $f16
	unop
	fabs	$f20, $f8
	/* load into $31 discards the data; presumably a prefetch hint */
	ldl	$31, 64 * SIZE(X)

	addt	$f10, $f11, $f17
	unop
	fabs	$f21, $f9
	LD	$f20,  0 * SIZE(X)

	addt	$f12, $f13, $f18
	LD	$f21,  1 * SIZE(X)
	fabs	$f22, $f10
	addq	X, INCX, X

	addt	$f14, $f15, $f19
	LD	$f22,  0 * SIZE(X)
	fabs	$f23, $f11
	unop

	CMPLT($f0,  $f16), $f4
	LD	$f23,  1 * SIZE(X)
	fabs	$f24, $f12
	addq	X, INCX, X

	CMPLT($f1,  $f17), $f5
	LD	$f24,  0 * SIZE(X)
	fabs	$f25, $f13
	unop

	CMPLT($f2,  $f18), $f6
	LD	$f25,  1 * SIZE(X)
	fabs	$f26, $f14
	addq	X, INCX, X

	CMPLT($f3,  $f19), $f7
	LD	$f26,  0 * SIZE(X)
	fabs	$f27, $f15
	unop

	fcmovne	$f4,  $f16, $f0
	LD	$f27,  1 * SIZE(X)
	addq	X, INCX, X
	lda	$1,   -1($1)		# i --

	fcmovne	$f5,  $f17, $f1
	fcmovne	$f6,  $f18, $f2
	fcmovne	$f7,  $f19, $f3
	bgt	$1,$L12
	.align 4

	/* drain, stage 1: finish the group in $f8-$f15, fabs the last one */
$L13:
	addt	$f8,  $f9,  $f16
	fabs	$f20, $f8

	addt	$f10, $f11, $f17
	fabs	$f21, $f9

	addt	$f12, $f13, $f18
	fabs	$f22, $f10

	addt	$f14, $f15, $f19
	fabs	$f23, $f11

	CMPLT($f0,  $f16), $f4
	fabs	$f24, $f12

	CMPLT($f1,  $f17), $f5
	fabs	$f25, $f13

	CMPLT($f2,  $f18), $f6
	fabs	$f26, $f14
	CMPLT($f3,  $f19), $f7
	fabs	$f27, $f15

	fcmovne	$f4,  $f16, $f0
	fcmovne	$f5,  $f17, $f1
	fcmovne	$f6,  $f18, $f2
	fcmovne	$f7,  $f19, $f3
	.align 4

	/*
	 * drain, stage 2: finish the last group, then reduce the four
	 * accumulators pairwise (4 -> 2 -> 1) into $f0
	 */
$L14:
	addt	$f8,  $f9,  $f16
	addt	$f10, $f11, $f17
	addt	$f12, $f13, $f18
	addt	$f14, $f15, $f19

	CMPLT($f0,  $f16), $f4
	CMPLT($f1,  $f17), $f5
	CMPLT($f2,  $f18), $f6
	CMPLT($f3,  $f19), $f7

	fcmovne	$f4,  $f16, $f0
	fcmovne	$f5,  $f17, $f1
	fcmovne	$f6,  $f18, $f2
	fcmovne	$f7,  $f19, $f3

	CMPLT($f0,  $f1), $f16
	CMPLT($f2,  $f3), $f17

	fcmovne	$f16, $f1, $f0
	fcmovne	$f17, $f3, $f2

	CMPLT($f0,  $f2), $f16
	fcmovne	$f16, $f2, $f0
	.align 4

	/*
	 * Pass-1 tail: the n & 3 leftover elements, one per trip.  When no
	 * full group was processed X still points at x[0], which is then
	 * simply compared against its own magnitude once more.
	 */
$L15:
	and	N, 3, $1
	unop
	unop
	ble	$1,  $L20
	.align 4

$L16:
	LD	$f20,  0 * SIZE(X)
	LD	$f21,  1 * SIZE(X)
	unop
	addq	X, INCX, X

	fabs	$f20, $f29
	fabs	$f21, $f30
	addt	$f29, $f30, $f29

	CMPLT($f0,  $f29), $f16
	fcmovne	$f16, $f29, $f0

	lda	$1,   -1($1)		# i --
	bgt	$1, $L16
	.align 4

	/*
	 * Pass 2: search from XX for the first element whose magnitude
	 * equals $f0.  $0 is incremented before every test, so it is the
	 * 1-based index of the element being tested when the branch to
	 * $End is taken.
	 */
$L20:
	sra	N, 2, $1
	ble	$1,  $L40
	.align 4

	/* pipeline fill: first group of four, fabs of its first two elements */
	LD	$f10,  0 * SIZE(XX)
	LD	$f11,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f12,  0 * SIZE(XX)
	LD	$f13,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f14,  0 * SIZE(XX)
	LD	$f15,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f16,  0 * SIZE(XX)
	LD	$f17,  1 * SIZE(XX)
	addq	XX, INCX, XX

	fabs	$f10, $f18
	fabs	$f11, $f19
	fabs	$f12, $f20
	fabs	$f13, $f21

	lda	$1,  -1($1)
	ble	$1, $L23
	.align 4

	/*
	 * Main search loop: each fabs of the current group is issued before
	 * its source register is reloaded with the next group.
	 */
$L22:
	LD	$f10,  0 * SIZE(XX)
	fabs	$f14, $f22
	LD	$f11,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f12,  0 * SIZE(XX)
	fabs	$f15, $f23
	LD	$f13,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f14,  0 * SIZE(XX)
	fabs	$f16, $f24
	LD	$f15,  1 * SIZE(XX)
	addq	XX, INCX, XX

	LD	$f16,  0 * SIZE(XX)
	fabs	$f17, $f25
	LD	$f17,  1 * SIZE(XX)
	addq	XX, INCX, XX

	addt	$f18, $f19, $f4
	addt	$f20, $f21, $f5
	addt	$f22, $f23, $f6
	addt	$f24, $f25, $f7

	cmpteq	$f0, $f4, $f26
	cmpteq	$f0, $f5, $f27
	cmpteq	$f0, $f6, $f28
	cmpteq	$f0, $f7, $f29

	fabs	$f10, $f18
	lda	$0,    1($0)
	lda	$1,   -1($1)		# i --
	fbne	$f26,  $End

	fabs	$f11, $f19
	lda	$0,    1($0)
	unop
	fbne	$f27,  $End

	fabs	$f12, $f20
	lda	$0,    1($0)
	unop
	fbne	$f28,  $End

	fabs	$f13, $f21
	lda	$0,    1($0)
	fbne	$f29,  $End
	bgt	$1,  $L22
	.align 4

	/* pipeline drain: test the last full group, no further loads */
$L23:
	fabs	$f14, $f22
	fabs	$f15, $f23
	fabs	$f16, $f24
	fabs	$f17, $f25

	addt	$f18, $f19, $f4
	addt	$f20, $f21, $f5
	addt	$f22, $f23, $f6
	addt	$f24, $f25, $f7

	cmpteq	$f0, $f4, $f26
	cmpteq	$f0, $f5, $f27
	cmpteq	$f0, $f6, $f28
	cmpteq	$f0, $f7, $f29

	lda	$0,    1($0)
	fbne	$f26,  $End
	lda	$0,    1($0)
	fbne	$f27,  $End
	lda	$0,    1($0)
	fbne	$f28,  $End
	lda	$0,    1($0)
	fbne	$f29,  $End
	.align 4

	/*
	 * Search tail, one element per trip.  The loop carries no counter:
	 * it ends only when a magnitude compares equal to $f0.
	 * NOTE(review): if nothing ever compares equal (presumably only
	 * possible when $f0 is a NaN) this keeps reading past the vector --
	 * confirm callers exclude that input.
	 */
$L40:
	LD	$f10,  0 * SIZE(XX)
	LD	$f11,  1 * SIZE(XX)

	addq	XX, INCX, XX

	fabs	$f10, $f18
	fabs	$f11, $f19

	addt	$f18, $f19, $f18
	cmpteq	$f0, $f18, $f2

	lda	$0,    1($0)
	fbne	$f2,  $End
	br	$31, $L40
	.align 4

	/* common exit: reload $f2-$f9 and release the frame */
$End:
	ldt	$f2,   0($sp)
	ldt	$f3,   8($sp)
	ldt	$f4,  16($sp)
	ldt	$f5,  24($sp)

	ldt	$f6,  32($sp)
	ldt	$f7,  40($sp)
	ldt	$f8,  48($sp)
	ldt	$f9,  56($sp)
	lda	$sp,  STACKSIZE($sp)
	ret

	EPILOGUE
|
||||
76
kernel/alpha/lsame.S
Normal file
76
kernel/alpha/lsame.S
Normal file
@@ -0,0 +1,76 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "version.h"
|
||||
|
||||
/*
 * lsame_ -- case-insensitive comparison of two single characters.
 *
 * In:   $16 = address of the first character
 *       $17 = address of the second character
 * Out:  $0  = 1 when both bytes are equal after folding 'a'..'z' to
 *             'A'..'Z', otherwise 0
 * Uses: $1-$6 as temporaries; no stack frame (.frame size 0).
 *
 * Only bytes in the range 'a'..'z' (97..122) are folded.  Folding every
 * byte above 96 would also map '{', '|', '}', '~' and all bytes >= 128
 * onto other characters and report false matches such as '{' == '['.
 */
	.set noat
	.set noreorder
.text
	.align 5
	.globl lsame_
	.ent lsame_
lsame_:
	.frame $sp,0,$26,0
#ifdef PROFILE
	ldgp	$gp, 0($27)
	lda	$28, _mcount
	jsr	$28, ($28), _mcount
	.prologue 1
#else
	.prologue 0
#endif

	/* fetch one byte from each (possibly unaligned) address */
	ldq_u	$5, 0($16)
	ldq_u	$6, 0($17)
	extbl	$5, $16, $5
	extbl	$6, $17, $6

	/* $1/$2 = c - 'a', $3/$4 = c - 32 (the upper-case candidate) */
	subl	$5, 97, $1
	subl	$6, 97, $2
	subl	$5, 32, $3
	subl	$6, 32, $4

	/*
	 * Unsigned compare: c - 'a' <= 25 holds exactly for 'a'..'z'; a
	 * negative difference is a huge unsigned value and fails the test.
	 */
	cmpule	$1, 25, $1
	cmpule	$2, 25, $2

	/* replace each byte by its upper-case form when it is a lower-case letter */
	cmovne	$1, $3, $5
	cmovne	$2, $4, $6
	cmpeq	$5, $6, $0
	.align 4

$End:
	ret
	.end lsame_
|
||||
.ident VERSION
|
||||
227
kernel/alpha/max.S
Normal file
227
kernel/alpha/max.S
Normal file
@@ -0,0 +1,227 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Extremum kernel for a strided vector of real elements.
 *
 * Walks the vector once and returns the extreme element in $f0: the
 * largest one by default, the smallest one when USE_MIN is defined
 * (CMPLT then swaps its two operands).  $f0 is cleared (fclr) and
 * returned as-is when n <= 0 or incx <= 0.
 *
 * Register roles:
 *   N    ($16)  element count (loaded through the pointer when
 *               F_INTERFACE is defined)
 *   X    ($17)  read cursor
 *   INCX ($18)  stride (loaded through the pointer when F_INTERFACE is
 *               defined), rewritten by SXADDQ before it is added to X;
 *               presumably scaled to bytes -- confirm in common.h
 *   $0          argument-check result, $1 loop counter, $2/$3 temporaries
 *   $f0,$f1,$f10-$f15   eight running extrema
 *   $f20-$f27           loaded elements, $f16-$f19 compare flags
 *
 * A STACKSIZE-byte frame is opened on entry and released at $End; no
 * instruction in this routine stores into it.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LD, SIZE and SXADDQ are macros that are
 * not defined in this block.
 *
 * The instruction order is hand scheduled (loads of the next group are
 * interleaved with compares of the current one); do not reorder.
 */
#define N	$16
#define X	$17
#define INCX	$18

#ifndef USE_MIN
#define CMPLT(a, b) cmptlt a, b
#else
#define CMPLT(a, b) cmptlt b, a
#endif

#define STACKSIZE 8 * 8

	PROLOGUE
	PROFCODE
	.frame	$sp, STACKSIZE, $26, 0

#ifdef F_INTERFACE
	ldl	N,     0(N)		# n
	ldl	INCX,  0(INCX)		# incx
#endif
	lda	$sp, -STACKSIZE($sp)
	nop
	.align 4

	/* $0 = (n > 0) && (incx > 0) */
	cmplt	$31, N,    $2
	cmplt	$31, INCX, $3
	SXADDQ	INCX, $31, INCX
	and	$2,  $3,  $0

	/* $1 = number of groups of eight elements */
	sra	N, 3, $1
	fclr	$f0
	unop
	beq	$0,  $End		# if (n <= 0) or (incx <= 0) return
	.align 4

	/* seed the running extremum with x[0] */
	LD	$f0,  0 * SIZE(X)
	unop
	unop
	ble	$1,  $L15
	.align 4

	/*
	 * Pipeline fill: copy the seed into all eight accumulators, keep
	 * x[0] in $f20 and load x[1..7] into $f21-$f27.
	 */
	fmov	$f0,  $f1
	addq	X, INCX, X
	fmov	$f0,  $f10
	lda	$1,  -1($1)

	LD	$f21,  0 * SIZE(X)
	fmov	$f0,  $f11
	addq	X, INCX, X
	fmov	$f0,  $f12

	LD	$f22,  0 * SIZE(X)
	fmov	$f0,  $f13
	addq	X, INCX, X
	fmov	$f0,  $f14

	LD	$f23,  0 * SIZE(X)
	fmov	$f0,  $f15
	addq	X, INCX, X
	fmov	$f0,  $f20

	LD	$f24,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f25,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f26,  0 * SIZE(X)
	addq	X, INCX, X
	LD	$f27,  0 * SIZE(X)
	addq	X, INCX, X

	CMPLT($f0,  $f20), $f16
	CMPLT($f1,  $f21), $f17
	CMPLT($f10, $f22), $f18
	CMPLT($f11, $f23), $f19

	ble	$1, $L13
	.align 4

	/*
	 * Main loop, eight elements per trip.  Each fcmovne commits a
	 * compare issued earlier, then the same register is reloaded with
	 * the matching element of the next group.
	 */
$L12:
	fcmovne	$f16, $f20, $f0
	LD	$f20,  0 * SIZE(X)
	CMPLT($f12, $f24), $f16
	addq	X, INCX, X

	fcmovne	$f17, $f21, $f1
	LD	$f21,  0 * SIZE(X)
	CMPLT($f13, $f25), $f17
	addq	X, INCX, X

	fcmovne	$f18, $f22, $f10
	LD	$f22,  0 * SIZE(X)
	CMPLT($f14, $f26), $f18
	addq	X, INCX, X

	fcmovne	$f19, $f23, $f11
	LD	$f23,  0 * SIZE(X)
	CMPLT($f15, $f27), $f19
	addq	X, INCX, X

	fcmovne	$f16, $f24, $f12
	LD	$f24,  0 * SIZE(X)
	CMPLT($f0,  $f20), $f16
	addq	X, INCX, X

	fcmovne	$f17, $f25, $f13
	LD	$f25,  0 * SIZE(X)
	CMPLT($f1,  $f21), $f17
	addq	X, INCX, X

	fcmovne	$f18, $f26, $f14
	LD	$f26,  0 * SIZE(X)
	CMPLT($f10, $f22), $f18
	addq	X, INCX, X

	fcmovne	$f19, $f27, $f15
	LD	$f27,  0 * SIZE(X)
	CMPLT($f11, $f23), $f19
	lda	$1,   -1($1)		# i --

	addq	X, INCX, X
	unop
	unop
	bgt	$1,$L12
	.align 4

	/*
	 * Pipeline drain: commit the last group, then reduce the eight
	 * accumulators pairwise (8 -> 4 -> 2 -> 1) into $f0.
	 */
$L13:
	fcmovne	$f16, $f20, $f0
	CMPLT($f12, $f24), $f16

	fcmovne	$f17, $f21, $f1
	CMPLT($f13, $f25), $f17

	fcmovne	$f18, $f22, $f10
	CMPLT($f14, $f26), $f18

	fcmovne	$f19, $f23, $f11
	CMPLT($f15, $f27), $f19

	fcmovne	$f16, $f24, $f12
	CMPLT($f0,  $f1), $f16
	fcmovne	$f17, $f25, $f13
	CMPLT($f10, $f11), $f17

	fcmovne	$f18, $f26, $f14
	CMPLT($f12, $f13), $f18
	fcmovne	$f19, $f27, $f15
	CMPLT($f14, $f15), $f19

	fcmovne	$f16, $f1, $f0
	fcmovne	$f17, $f11, $f10
	fcmovne	$f18, $f13, $f12
	fcmovne	$f19, $f15, $f14

	CMPLT($f0,  $f10), $f16
	CMPLT($f12, $f14), $f17

	fcmovne	$f16, $f10, $f0
	fcmovne	$f17, $f14, $f12

	CMPLT($f0,  $f12), $f16
	fcmovne	$f16, $f12, $f0
	.align 4

	/*
	 * Tail: the n & 7 leftover elements, one per trip.  When no full
	 * group was processed X still points at x[0], which is then simply
	 * compared against itself once more.
	 */
$L15:
	and	N, 7, $1
	unop
	unop
	ble	$1,  $End
	.align 4

$L16:
	LD	$f20,  0 * SIZE(X)
	addq	X, INCX, X

	CMPLT($f0,  $f20), $f16
	fcmovne	$f16, $f20, $f0
	lda	$1,   -1($1)		# i --
	bgt	$1, $L16
	.align 4

	/* common exit: release the frame; the result is already in $f0 */
$End:
	lda	$sp,  STACKSIZE($sp)
	ret

	EPILOGUE
|
||||
624
kernel/alpha/rot.S
Normal file
624
kernel/alpha/rot.S
Normal file
@@ -0,0 +1,624 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define Y $19
|
||||
#define INCY $20
|
||||
#define I $21
|
||||
#define XX $23
|
||||
#define YY $24
|
||||
|
||||
#define C $f10
|
||||
#define S $f11
|
||||
|
||||
#define PREFETCH_SIZE 80
|
||||
|
||||
/* Plane rotation applied in place to two vectors. For every i:       */
/*     x[i] = C * x[i] + S * y[i]                                     */
/*     y[i] = C * y[i] - S * x[i]      (both from the old values)     */
/* $0 is cleared before every return.                                 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
/* Move the first coefficient out of $f21, which the loops below use  */
/* as a temporary. The second coefficient is read from 0($sp).        */
fmov $f21, C
|
||||
LD S, 0($sp)
|
||||
|
||||
/* $23 = (INCX == 1), $24 = (INCY == 1). Nothing to do when N <= 0.   */
cmpeq INCX, 1, $23
|
||||
cmpeq INCY, 1, $24
|
||||
ble N, $L998
|
||||
|
||||
/* The contiguous code below needs both strides equal to 1;           */
/* otherwise branch to the strided code at $L50.                      */
and $23, $24, $23
|
||||
beq $23, $L50
|
||||
|
||||
/* I = N / 8 full groups. With none, go straight to the remainder.    */
sra N, 3, I
|
||||
ble I, $L15
|
||||
|
||||
/* Unit-stride pipeline fill: load x[0..3] and y[0..3], start the     */
/* first products, and preload x[4], y[4], y[5] before entering the   */
/* main loop ($L12) or, when only one group exists, the drain ($L13). */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
LD $f15, 1*SIZE(Y)
|
||||
|
||||
LD $f16, 2*SIZE(X)
|
||||
LD $f17, 2*SIZE(Y)
|
||||
LD $f18, 3*SIZE(X)
|
||||
LD $f19, 3*SIZE(Y)
|
||||
|
||||
MUL C, $f12, $f21
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
|
||||
LD $f13, 4*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
LD $f12, 4*SIZE(X)
|
||||
MUL C, $f14, $f25
|
||||
|
||||
lda I, -1(I)
|
||||
MUL S, $f15, $f26
|
||||
ADD $f21, $f22, $f22
|
||||
MUL C, $f15, $f27
|
||||
|
||||
LD $f15, 5*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
SUB $f23, $f24, $f24
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/* Software-pipelined main loop, 8 elements per pass. Each pass       */
/* stores the rotated x[0..7] and y[0..7] of the current group while  */
/* loading operands of the next group (up to x[12] and y[13]).        */
/* X and Y are advanced by 8 elements part-way through the pass, so   */
/* the offsets used after each lda (4, 5, -1) are relative to the     */
/* already advanced pointer. The order is hand-scheduled; do not      */
/* reorder instructions.                                              */
$L12:
|
||||
MUL C, $f16, $f21
|
||||
lds $f31, (PREFETCH_SIZE) * SIZE(X)
|
||||
unop
|
||||
LD $f14, 5*SIZE(X)
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
lds $f31, (PREFETCH_SIZE) * SIZE(Y)
|
||||
unop
|
||||
LD $f17, 6*SIZE(Y)
|
||||
|
||||
ST $f24, 0*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
LD $f16, 6*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
LD $f19, 7*SIZE(Y)
|
||||
|
||||
ST $f28, 1*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
LD $f18, 7*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 2*SIZE(X)
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
LD $f13, 8*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 2*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
LD $f12, 8*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 3*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
LD $f15, 9*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 3*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f16, $f21
|
||||
LD $f14, 9*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 4*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
LD $f17, 10*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 4*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
LD $f16, 10*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 5*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
LD $f19, 11*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 5*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
lda I, -1(I)
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
LD $f18, 11*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 6*SIZE(X)
|
||||
MUL S, $f13, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
LD $f13, 12*SIZE(Y)
|
||||
lda X, 8*SIZE(X)
|
||||
unop
|
||||
|
||||
ST $f24, 6*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
LD $f12, 4*SIZE(X)
|
||||
lda Y, 8*SIZE(Y)
|
||||
unop
|
||||
|
||||
ST $f26, -1*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
LD $f15, 5*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, -1*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
SUB $f23, $f24, $f24
|
||||
bgt I, $L12
|
||||
.align 4
|
||||
|
||||
/* Pipeline drain: finishes the last full group of 8. It loads only   */
/* up to x[7] and y[7] of that group, stores all 8 results to X and   */
/* Y, then advances both pointers by 8 elements.                      */
$L13:
|
||||
MUL C, $f16, $f21
|
||||
LD $f14, 5*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
unop
|
||||
unop
|
||||
LD $f17, 6*SIZE(Y)
|
||||
|
||||
ST $f24, 0*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
LD $f16, 6*SIZE(X)
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
LD $f19, 7*SIZE(Y)
|
||||
|
||||
ST $f28, 1*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
LD $f18, 7*SIZE(X)
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 2*SIZE(X)
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 2*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 3*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 3*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f16, $f21
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 4*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 4*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 5*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 5*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
ST $f22, 6*SIZE(X)
|
||||
ADD $f25, $f26, $f26
|
||||
ST $f24, 6*SIZE(Y)
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 7*SIZE(X)
|
||||
lda X, 8*SIZE(X)
|
||||
ST $f28, 7*SIZE(Y)
|
||||
lda Y, 8*SIZE(Y)
|
||||
.align 4
|
||||
|
||||
|
||||
/* Unit-stride remainder: rotate the last N mod 8 elements one at a   */
/* time, then return 0 in $0.                                         */
$L15:
	and	N, 7, I
	ble	I, $L998
	.align 4

/* One element per pass:                                              */
/*     x = C * x + S * y                                              */
/*     y = C * y - S * x   (old x)                                    */
$L16:
	LD	$f12, 0*SIZE(X)
	lda	I, -1(I)
	LD	$f13, 0*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	ST	$f22, 0*SIZE(X)
	lda	X, 1 * SIZE(X)
	ST	$f24, 0*SIZE(Y)
	lda	Y, 1 * SIZE(Y)

	bgt	I, $L16
	.align 4

/* Common exit of the unit-stride path (also taken when N <= 0).      */
$L998:
	clr	$0
	ret
	.align 4
|
||||
|
||||
/* Strided path (INCX or INCY differs from 1). X and Y are the load   */
/* cursors. XX and YY start at the same addresses and are used as     */
/* the store cursors in $L51.                                         */
$L50:
|
||||
mov X, XX
|
||||
mov Y, YY
|
||||
|
||||
/* I = N / 8 full groups. With none, go to the strided remainder.     */
sra N, 3, I
|
||||
ble I, $L55
|
||||
.align 4
|
||||
|
||||
/* Strided main loop, 8 elements per pass as two groups of 4. Each    */
/* group loads four x/y pairs while stepping X and Y, rotates them,   */
/* and stores the results through XX and YY. No loaded value is       */
/* carried from one pass to the next.                                 */
$L51:
|
||||
LD $f12, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f13, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f14, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f16, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f17, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f18, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f19, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f24, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f16, $f21
|
||||
MUL S, $f17, $f22
|
||||
MUL C, $f17, $f23
|
||||
MUL S, $f16, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f24, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f18, $f25
|
||||
MUL S, $f19, $f26
|
||||
MUL C, $f19, $f27
|
||||
MUL S, $f18, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
|
||||
LD $f12, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f13, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f14, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f16, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f17, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
LD $f18, 0*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f19, 0*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f24, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f16, $f21
|
||||
MUL S, $f17, $f22
|
||||
MUL C, $f17, $f23
|
||||
MUL S, $f16, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f24, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
MUL C, $f18, $f25
|
||||
MUL S, $f19, $f26
|
||||
MUL C, $f19, $f27
|
||||
MUL S, $f18, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 0*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 0*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L51
|
||||
.align 4
|
||||
|
||||
/* Strided remainder: rotate the last N mod 8 elements one at a time, */
/* loading and storing through X and Y directly, then return 0.       */
$L55:
	and	N, 7, I
	ble	I, $L999
	.align 4

/* One element per pass:                                              */
/*     x = C * x + S * y                                              */
/*     y = C * y - S * x   (old x)                                    */
$L56:
	LD	$f12, 0*SIZE(X)
	lda	I, -1(I)
	LD	$f13, 0*SIZE(Y)

	MUL	C, $f12, $f21
	MUL	S, $f13, $f22
	MUL	C, $f13, $f23
	MUL	S, $f12, $f24

	ADD	$f21, $f22, $f22
	SUB	$f23, $f24, $f24

	ST	$f22, 0*SIZE(X)
	SXADDQ	INCX, X, X
	ST	$f24, 0*SIZE(Y)
	SXADDQ	INCY, Y, Y

	bgt	I, $L56
	.align 4

/* Exit of the strided path.                                          */
$L999:
	clr	$0
	ret
	EPILOGUE
|
||||
480
kernel/alpha/scal.S
Normal file
480
kernel/alpha/scal.S
Normal file
@@ -0,0 +1,480 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Offset (multiplied by SIZE) of the lds-to-$f31 touch issued on X   */
/* in the main loops.                                                 */
#define PREFETCHSIZE 88
|
||||
|
||||
/* N is the element count, X the vector pointer (also the load        */
/* cursor), INCX the stride (compared against 1 and fed to SXADDQ).   */
#define N $16
|
||||
#define X $20
|
||||
#define INCX $21
|
||||
|
||||
/* XX is the store cursor of the strided path; I is the loop counter. */
#define XX $18
|
||||
#define I $19
|
||||
|
||||
/* ALPHA is the scale factor, read from $f19.                         */
#define ALPHA $f19
|
||||
|
||||
/* s0..s3 are defined here but not referenced by this routine.        */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0..a7 hold values loaded from the vector.                         */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f21
|
||||
|
||||
/* t0..t3 hold the products a * ALPHA waiting to be stored.           */
#define t0 $f22
|
||||
#define t1 $f23
|
||||
#define t2 $f24
|
||||
#define t3 $f25
|
||||
|
||||
/* Scale a vector in place: x[i] = x[i] * ALPHA for each of the N     */
/* elements.                                                          */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* XX keeps the original X; the strided path stores through it.       */
mov X, XX
|
||||
/* Nothing to do for N <= 0.                                          */
ble N, $L999
|
||||
|
||||
/* Any stride other than 1 is handled by the strided code at $L20.    */
cmpeq INCX, 1, $0
|
||||
beq $0, $L20
|
||||
|
||||
/* Unit-stride path when DOUBLE is not defined (presumably the        */
/* single-precision build): 16 elements per pass (I = N / 16). The    */
/* fill code stores elements 0..7 and preloads 8..15 before the loop. */
#ifndef DOUBLE
|
||||
sra N, 4, I
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD a2, 2 * SIZE(X)
|
||||
LD a3, 3 * SIZE(X)
|
||||
|
||||
LD a4, 4 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
LD a5, 5 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
LD a6, 6 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
LD a7, 7 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
ST t0, 0 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 1 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 2 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 3 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
LD a0, 8 * SIZE(X)
|
||||
LD a1, 9 * SIZE(X)
|
||||
LD a2, 10 * SIZE(X)
|
||||
LD a3, 11 * SIZE(X)
|
||||
|
||||
ST t0, 4 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
ST t1, 5 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
|
||||
ST t2, 6 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
ST t3, 7 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
LD a4, 12 * SIZE(X)
|
||||
LD a5, 13 * SIZE(X)
|
||||
LD a6, 14 * SIZE(X)
|
||||
LD a7, 15 * SIZE(X)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/* Steady state of the 16-wide loop. Relative to X at the top of the  */
/* pass it stores the results for elements 8..23 and loads elements   */
/* 16..31, then advances X by 16. Hand-scheduled; do not reorder.     */
$L12:
|
||||
ST t0, 8 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 9 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 10 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 11 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
LD a0, 16 * SIZE(X)
|
||||
LD a1, 17 * SIZE(X)
|
||||
LD a2, 18 * SIZE(X)
|
||||
LD a3, 19 * SIZE(X)
|
||||
|
||||
ST t0, 12 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
ST t1, 13 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
|
||||
ST t2, 14 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
ST t3, 15 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
LD a4, 20 * SIZE(X)
|
||||
LD a5, 21 * SIZE(X)
|
||||
LD a6, 22 * SIZE(X)
|
||||
LD a7, 23 * SIZE(X)
|
||||
|
||||
ST t0, 16 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 17 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 18 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 19 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
LD a0, 24 * SIZE(X)
|
||||
LD a1, 25 * SIZE(X)
|
||||
LD a2, 26 * SIZE(X)
|
||||
LD a3, 27 * SIZE(X)
|
||||
|
||||
ST t0, 20 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
ST t1, 21 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
|
||||
ST t2, 22 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
ST t3, 23 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
LD a4, 28 * SIZE(X)
|
||||
LD a5, 29 * SIZE(X)
|
||||
LD a6, 30 * SIZE(X)
|
||||
LD a7, 31 * SIZE(X)
|
||||
|
||||
lds $f31, PREFETCHSIZE * SIZE(X)
|
||||
lda I, -1(I)
|
||||
addq X, 16 * SIZE, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
$L13:
|
||||
ST t0, 8 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 9 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 10 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 11 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
ST t0, 12 * SIZE(X)
|
||||
ST t1, 13 * SIZE(X)
|
||||
ST t2, 14 * SIZE(X)
|
||||
ST t3, 15 * SIZE(X)
|
||||
addq X, 16 * SIZE, X
|
||||
.align 4
|
||||
|
||||
$L15:
|
||||
and N, 15, I
|
||||
|
||||
/* Unit-stride path when DOUBLE is defined: 8 elements per pass       */
/* (I = N / 8). In its $L12 loop X is advanced by 8 part-way through, */
/* so the offsets used after the addq (-4..-1 for stores, 2..7 for    */
/* loads) are relative to the already advanced pointer.               */
#else
|
||||
|
||||
sra N, 3, I
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD a2, 2 * SIZE(X)
|
||||
LD a3, 3 * SIZE(X)
|
||||
|
||||
LD a4, 4 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
LD a5, 5 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
|
||||
LD a6, 6 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
LD a7, 7 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
$L12:
|
||||
ST t0, 0 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 1 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 2 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 3 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
LD a0, 8 * SIZE(X)
|
||||
lda I, -1(I)
|
||||
LD a1, 9 * SIZE(X)
|
||||
addq X, 8 * SIZE, X
|
||||
|
||||
LD a2, 2 * SIZE(X)
|
||||
LD a3, 3 * SIZE(X)
|
||||
|
||||
ST t0, -4 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
ST t1, -3 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
|
||||
ST t2, -2 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
ST t3, -1 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
|
||||
LD a4, 4 * SIZE(X)
|
||||
LD a5, 5 * SIZE(X)
|
||||
|
||||
LD a6, 6 * SIZE(X)
|
||||
LD a7, 7 * SIZE(X)
|
||||
lds $f31, PREFETCHSIZE * SIZE(X)
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
$L13:
|
||||
ST t0, 0 * SIZE(X)
|
||||
MUL a4, ALPHA, t0
|
||||
ST t1, 1 * SIZE(X)
|
||||
MUL a5, ALPHA, t1
|
||||
|
||||
ST t2, 2 * SIZE(X)
|
||||
MUL a6, ALPHA, t2
|
||||
ST t3, 3 * SIZE(X)
|
||||
MUL a7, ALPHA, t3
|
||||
|
||||
ST t0, 4 * SIZE(X)
|
||||
ST t1, 5 * SIZE(X)
|
||||
ST t2, 6 * SIZE(X)
|
||||
ST t3, 7 * SIZE(X)
|
||||
addq X, 8 * SIZE, X
|
||||
.align 4
|
||||
|
||||
$L15:
|
||||
and N, 7, I
|
||||
|
||||
#endif
|
||||
|
||||
unop
|
||||
unop
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
$L17:
|
||||
LD a0, 0 * SIZE(X)
|
||||
|
||||
MUL a0, ALPHA, t0
|
||||
|
||||
ST t0, 0 * SIZE(X)
|
||||
|
||||
addq X, SIZE, X
|
||||
|
||||
lda I, -1(I)
|
||||
bne I, $L17
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* Strided path (INCX != 1), 8 elements per pass. X is the load       */
/* cursor and XX (set to X at entry) the store cursor. The fill code  */
/* below loads 8 elements before anything is stored, so loads run 8   */
/* elements ahead of stores throughout $L22.                          */
$L20:
|
||||
sra N, 3, I
|
||||
ble I, $L25
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a1, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
MUL a0, ALPHA, t0
|
||||
lda I, -1(I)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
MUL a1, ALPHA, t1
|
||||
SXADDQ INCX, X, X
|
||||
unop
|
||||
|
||||
LD a6, 0 * SIZE(X)
|
||||
MUL a2, ALPHA, t2
|
||||
SXADDQ INCX, X, X
|
||||
unop
|
||||
|
||||
LD a7, 0 * SIZE(X)
|
||||
MUL a3, ALPHA, t3
|
||||
SXADDQ INCX, X, X
|
||||
ble I, $L23
|
||||
.align 4
|
||||
|
||||
/* Strided steady state: each pass stores 8 scaled elements through   */
/* XX and loads the next 8 through X, stepping each cursor by INCX    */
/* after every access. Hand-scheduled; do not reorder.                */
$L22:
|
||||
ST t0, 0 * SIZE(XX)
|
||||
MUL a4, ALPHA, t0
|
||||
lds $f31, PREFETCHSIZE * SIZE(X)
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
lda I, -1(I)
|
||||
unop
|
||||
|
||||
ST t1, 0 * SIZE(XX)
|
||||
MUL a5, ALPHA, t1
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t2, 0 * SIZE(XX)
|
||||
MUL a6, ALPHA, t2
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t3, 0 * SIZE(XX)
|
||||
MUL a7, ALPHA, t3
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t0, 0 * SIZE(XX)
|
||||
MUL a0, ALPHA, t0
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t1, 0 * SIZE(XX)
|
||||
MUL a1, ALPHA, t1
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t2, 0 * SIZE(XX)
|
||||
MUL a2, ALPHA, t2
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a6, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ST t3, 0 * SIZE(XX)
|
||||
MUL a3, ALPHA, t3
|
||||
SXADDQ INCX, XX, XX
|
||||
unop
|
||||
|
||||
LD a7, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
unop
|
||||
bne I, $L22
|
||||
.align 4
|
||||
|
||||
/* Strided drain: stores the 8 elements still in flight through XX    */
/* without loading anything further, leaving XX level with X.         */
$L23:
|
||||
ST t0, 0 * SIZE(XX)
|
||||
MUL a4, ALPHA, t0
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
ST t1, 0 * SIZE(XX)
|
||||
MUL a5, ALPHA, t1
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
ST t2, 0 * SIZE(XX)
|
||||
MUL a6, ALPHA, t2
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
ST t3, 0 * SIZE(XX)
|
||||
MUL a7, ALPHA, t3
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
ST t0, 0 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST t1, 0 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST t2, 0 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST t3, 0 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
.align 4
|
||||
|
||||
/* Strided remainder: scale the last N mod 8 elements one at a time.  */
/* The two unops are padding only.                                    */
$L25:
	and	N, 7, I
	unop
	unop
	ble	I, $L999
	.align 4

/* One element per pass: load through X, store through XX, then step  */
/* both cursors by INCX.                                              */
$L27:
	LD	a0, 0 * SIZE(X)
	SXADDQ	INCX, X, X
	lda	I, -1(I)

	MUL	a0, ALPHA, t0

	ST	t0, 0 * SIZE(XX)
	SXADDQ	INCX, XX, XX

	bne	I, $L27
	.align 4

/* Common exit.                                                       */
$L999:
	ret
	EPILOGUE
|
||||
431
kernel/alpha/snrm2.S
Normal file
431
kernel/alpha/snrm2.S
Normal file
@@ -0,0 +1,431 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Offset (multiplied by SIZE) of the ldl-to-$31 touch issued on X in */
/* the unit-stride main loop.                                         */
#define PREFETCH_SIZE 80
|
||||
|
||||
/* N is the element count, X the vector pointer, INCX the stride      */
/* (rescaled at entry with SXADDQ). XX is a copy of X taken inside    */
/* the unit-stride loops before X is advanced.                        */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define XX $19
|
||||
|
||||
/* I is the loop counter. $0 is also used as a scratch flag at entry. */
#define I $0
|
||||
|
||||
/* a0..a3 are four partial sums of squares (a0 is $f0).               */
/* t0..t3 are the most recent products, added to a0..a3 one step      */
/* later.                                                             */
#define a0 $f0
|
||||
#define a1 $f1
|
||||
#define a2 $f10
|
||||
#define a3 $f11
|
||||
#define t0 $f12
|
||||
#define t1 $f13
|
||||
#define t2 $f14
|
||||
#define t3 $f15
|
||||
|
||||
/* x0..x7 hold values loaded from the vector.                         */
#define x0 $f16
|
||||
#define x1 $f17
|
||||
#define x2 $f18
|
||||
#define x3 $f19
|
||||
#define x4 $f20
|
||||
#define x5 $f21
|
||||
#define x6 $f22
|
||||
#define x7 $f23
|
||||
|
||||
/* Euclidean norm: accumulates the sum of x[i] * x[i] with addt/mult  */
/* and takes its square root at $L998. For N <= 0 the routine returns */
/* with a0 ($f0) cleared to zero.                                     */
PROLOGUE
|
||||
|
||||
/* EV4/EV5 builds call the external sqrt routine at $L998 instead of  */
/* using sqrtt. They therefore set up $29, allocate a 16-byte frame,  */
/* save the return address $26, and load the address of sqrt in $27.  */
#if defined(EV4) || defined(EV5)
|
||||
.frame $30,16,$26,0
|
||||
.mask 0x4000000,-16
|
||||
ldah $29, 0($27) !gpdisp!1
|
||||
lda $29, 0($29) !gpdisp!1
|
||||
|
||||
lda $sp, -16($sp)
|
||||
ldq $27, sqrt($29) !literal!2
|
||||
stq $26, 0($sp)
|
||||
|
||||
PROFCODE
|
||||
.prologue 1
|
||||
#else
|
||||
PROFCODE
|
||||
#endif
|
||||
|
||||
/* Clear the partial sums. INCX is rescaled by SXADDQ (added to 0);   */
/* the result is compared against SIZE and later added directly to X, */
/* i.e. it is used as a byte stride from here on.                     */
fclr a0
|
||||
SXADDQ INCX, 0, INCX
|
||||
fclr a1
|
||||
/* Nothing to sum for N <= 0; a0 is already zero.                     */
ble N, $L999
|
||||
|
||||
fclr a2
|
||||
/* A stride other than one element goes to the strided code at $L20.  */
cmpeq INCX, SIZE, $0
|
||||
fclr a3
|
||||
beq $0, $L20
|
||||
|
||||
/* Unit stride: I = N / 16 full groups. With none, go straight to the */
/* remainder loop; only t0 and t1 have been cleared at that point,    */
/* and the remainder loop uses t0 only.                               */
fclr t0
|
||||
sra N, 4, I
|
||||
fclr t1
|
||||
ble I, $L15
|
||||
|
||||
fclr t2
|
||||
LD x0, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
LD x2, 2 * SIZE(X)
|
||||
LD x3, 3 * SIZE(X)
|
||||
LD x4, 4 * SIZE(X)
|
||||
LD x5, 5 * SIZE(X)
|
||||
LD x6, 6 * SIZE(X)
|
||||
LD x7, 7 * SIZE(X)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L12
|
||||
.align 4
|
||||
|
||||
/* Unit-stride main loop, 16 elements per pass. It squares the eight  */
/* values held in x0..x7 while loading the next eight, twice per      */
/* pass. Each product in t0..t3 is added to its partial sum a0..a3    */
/* one step later. XX is set to X near the top of the pass and used   */
/* for the loads issued after X has been advanced by 16.              */
/* Hand-scheduled; do not reorder.                                    */
$L11:
|
||||
addt a0, t0, a0
|
||||
ldl $31, (PREFETCH_SIZE) * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
mov X, XX
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x0, x0, t0
|
||||
LD x0, 16 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
LD x1, 17 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 18 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 19 * SIZE(XX)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 20 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda I, -1(I)
|
||||
mult x5, x5, t1
|
||||
LD x5, 21 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 22 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
LD x7, 23 * SIZE(XX)
|
||||
bgt I, $L11
|
||||
.align 4
|
||||
|
||||
/* Unit-stride drain: squares the last full group of 16, loading only */
/* up to element 15 of that group. It leaves one product pending in   */
/* t0, which is added in by the remainder loop or at $L998.           */
$L12:
|
||||
addt a0, t0, a0
|
||||
mov X, XX
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
lda X, 16 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(XX)
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
addt a1, t1, a1
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* Unit-stride remainder: accumulate the last N mod 16 squares.       */
$L15:
	and	N, 15, I
	ble	I, $L998
	.align 4

/* One element per pass. The product from the previous pass (or from  */
/* the main loop) is added to a0 before t0 is overwritten with the    */
/* new square, so one product is always left pending in t0 on exit.   */
$L16:
	LD	x0, 0 * SIZE(X)
	lda	X, 1 * SIZE(X)

	addt	a0, t0, a0
	mult	x0, x0, t0

	lda	I, -1(I)
	bgt	I, $L16

	/* Plain jump to the final reduction. br is used, not bsr:    */
	/* both transfer control identically when the return address  */
	/* goes to $31, but bsr carries the subroutine-call           */
	/* prediction hint, which has no matching return here.        */
	br	$31, $L998
	.align 4
|
||||
|
||||
/* Strided path, 8 elements per pass (I = N / 8). X is stepped with a */
/* plain addq of INCX, which was rescaled at entry. The fill code     */
/* preloads seven values (x0..x6); the eighth (x7) is loaded at the   */
/* top of $L21 or $L22.                                               */
$L20:
|
||||
fclr t0
|
||||
sra N, 3, I
|
||||
fclr t1
|
||||
ble I, $L25
|
||||
|
||||
fclr t2
|
||||
fclr t3
|
||||
|
||||
LD x0, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x1, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x2, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x3, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x4, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x5, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x6, 0 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L22
|
||||
.align 4
|
||||
|
||||
/* Strided main loop: loads x7 of the current group, squares x0..x7   */
/* into t0..t3, and reloads x0..x6 for the next group as each         */
/* register becomes free. Hand-scheduled; do not reorder.             */
$L21:
|
||||
addt a0, t0, a0
|
||||
LD x7, 0 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x0, 0 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
addq X, INCX, X
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x1, 0 * SIZE(X)
|
||||
mult x2, x2, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x2, 0 * SIZE(X)
|
||||
mult x3, x3, t3
|
||||
addq X, INCX, X
|
||||
|
||||
addt a0, t0, a0
|
||||
LD x3, 0 * SIZE(X)
|
||||
mult x4, x4, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x4, 0 * SIZE(X)
|
||||
mult x5, x5, t1
|
||||
addq X, INCX, X
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x5, 0 * SIZE(X)
|
||||
mult x6, x6, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x6, 0 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
addq X, INCX, X
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L21
|
||||
.align 4
|
||||
|
||||
/* Strided drain: loads the last x7, squares the final group of 8     */
/* with no further loads, and leaves one product pending in t0.       */
$L22:
|
||||
addt a0, t0, a0
|
||||
LD x7, 0 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
unop
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
addt a1, t1, a1
|
||||
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* Strided remainder: accumulate the last N mod 8 squares, then fall  */
/* through to the final reduction.                                    */
$L25:
	and	N, 7, I
	ble	I, $L998
	.align 4

/* One element per pass. The pending product in t0 is added to a0     */
/* before t0 is overwritten with the square of the new element.       */
$L26:
	addt	a0, t0, a0
	LD	x0, 0 * SIZE(X)
	lda	I, -1(I)

	mult	x0, x0, t0
	addq	X, INCX, X

	bgt	I, $L26
	.align 4
|
||||
|
||||
|
||||
/* Final reduction: add the last pending product t0, then combine the */
/* four partial sums and take the square root.                        */
$L998:
|
||||
addt a0, t0, a0
|
||||
|
||||
addt a0, a1, a0
|
||||
addt a2, a3, a2
|
||||
|
||||
/* EV4/EV5: put the total in $f16 and call sqrt through $27 (loaded   */
/* in the prologue), then rebuild $29 from the return address in $26. */
/* Other builds: take the root in place with sqrtt, leaving it in a0. */
#if defined(EV4) || defined(EV5)
|
||||
addt a0, a2, $f16
|
||||
jsr $26, ($27), sqrt !lituse_jsr!2
|
||||
|
||||
ldah $29, 0($26) !gpdisp!3
|
||||
lda $29, 0($29) !gpdisp!3
|
||||
#else
|
||||
addt a0, a2, a0
|
||||
sqrtt a0, a0
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
/* Common exit (also reached directly when N <= 0). On EV4/EV5,       */
/* reload the saved return address and release the 16-byte frame      */
/* allocated in the prologue.                                         */
$L999:
|
||||
#if defined(EV4) || defined(EV5)
|
||||
ldq $26, 0($sp)
|
||||
lda $sp, 16($sp)
|
||||
#endif
|
||||
ret
|
||||
EPILOGUE
|
||||
45
kernel/alpha/staticbuffer.S
Normal file
45
kernel/alpha/staticbuffer.S
Normal file
@@ -0,0 +1,45 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Optional statically reserved buffer area; assembled only when ALLOC_STATIC
   is defined. */
#ifdef ALLOC_STATIC
|
||||
.align 8
|
||||
/* Common symbol alloc_area of NUM_BUFFERS * BUFFER_SIZE bytes.  The third
   operand (16384) is the alignment requested for the symbol
   (NOTE(review): presumably bytes - confirm for the target assembler). */
.comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384
|
||||
#endif
|
||||
249
kernel/alpha/swap.S
Normal file
249
kernel/alpha/swap.S
Normal file
@@ -0,0 +1,249 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/*
 * Entry of the swap kernel: exchanges the elements of two vectors and
 * returns 0 in $0.  Register roles, as used by the code of this routine:
 *   $16       element count
 *   $17, $19  base pointers of the two vectors
 *   $18, $20  strides of the vectors at $17 and $19 (1 = contiguous)
 *   $21       count >> 3 (groups of 8), $22 = count & 7 (leftover)
 * NOTE(review): by BLAS convention $17 is presumably x and $19 is y; the
 * incoming argument layout is fixed by the caller and not visible here.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
/* Move two register arguments into place, then fetch a quadword (pointer)
   from 0($sp) and a longword (stride) from 8($sp). */
mov $20, $17
|
||||
mov $21, $18
|
||||
ldq $19, 0($sp)
|
||||
ldl $20, 8($sp)
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
/* $1 = ($18 - 1) | ($20 - 1): non-zero when either stride differs from 1. */
subl $18, 1, $1
|
||||
subl $20, 1, $2
|
||||
/* count <= 0: nothing to swap.  The branch target is $SubEnd, which
   clears $0 and returns. */
ble $16, $SubEnd # if n <= 0 goto $End
|
||||
or $1, $2, $1
|
||||
|
||||
/* $21 = number of full groups of 8 elements. */
sra $16, 3, $21
|
||||
|
||||
/* $22 = elements left over after the groups of 8. */
and $16, 7, $22
|
||||
/* Any non-unit stride takes the strided path at $Sub; otherwise fall into
   the unit-stride loop, skipping it when there is no full group. */
bne $1, $Sub
|
||||
ble $21, $MainRemain
|
||||
.align 4
|
||||
|
||||
/* Unit-stride path: each pass swaps 8 consecutive elements between the
   vectors at $17 and $19; $21 counts the remaining passes. */
$MainLoop:
|
||||
/* Read 8 elements from the vector at $19 into $f10-$f17 ... */
LD $f10, 0*SIZE($19)
|
||||
LD $f11, 1*SIZE($19)
|
||||
LD $f12, 2*SIZE($19)
|
||||
LD $f13, 3*SIZE($19)
|
||||
LD $f14, 4*SIZE($19)
|
||||
LD $f15, 5*SIZE($19)
|
||||
LD $f16, 6*SIZE($19)
|
||||
LD $f17, 7*SIZE($19)
|
||||
|
||||
/* ... and 8 elements from the vector at $17 into $f20-$f27. */
LD $f20, 0*SIZE($17)
|
||||
LD $f21, 1*SIZE($17)
|
||||
LD $f22, 2*SIZE($17)
|
||||
LD $f23, 3*SIZE($17)
|
||||
LD $f24, 4*SIZE($17)
|
||||
LD $f25, 5*SIZE($17)
|
||||
LD $f26, 6*SIZE($17)
|
||||
LD $f27, 7*SIZE($17)
|
||||
|
||||
/* Loads into $f31 from 32 elements ahead of each pointer; the loaded value
   is never read in this routine (NOTE(review): presumably a prefetch
   hint - confirm for the target CPU). */
lds $f31, 32*SIZE($17)
|
||||
unop
|
||||
lds $f31, 32*SIZE($19)
|
||||
subl $21, 1, $21
|
||||
|
||||
/* Store crosswise: what came from $19 goes to $17 ... */
ST $f10, 0*SIZE($17)
|
||||
ST $f11, 1*SIZE($17)
|
||||
ST $f12, 2*SIZE($17)
|
||||
ST $f13, 3*SIZE($17)
|
||||
ST $f14, 4*SIZE($17)
|
||||
ST $f15, 5*SIZE($17)
|
||||
ST $f16, 6*SIZE($17)
|
||||
ST $f17, 7*SIZE($17)
|
||||
|
||||
/* ... and what came from $17 goes to $19. */
ST $f20, 0*SIZE($19)
|
||||
ST $f21, 1*SIZE($19)
|
||||
ST $f22, 2*SIZE($19)
|
||||
ST $f23, 3*SIZE($19)
|
||||
ST $f24, 4*SIZE($19)
|
||||
ST $f25, 5*SIZE($19)
|
||||
ST $f26, 6*SIZE($19)
|
||||
ST $f27, 7*SIZE($19)
|
||||
|
||||
/* Advance both pointers by 8 elements and loop while passes remain. */
lda $17, 8*SIZE($17)
|
||||
lda $19, 8*SIZE($19)
|
||||
bgt $21, $MainLoop
|
||||
.align 4
|
||||
|
||||
/* Leftover elements ($22 = count & 7); skip when there are none. */
$MainRemain:
|
||||
ble $22, $MainEnd
|
||||
.align 4
|
||||
|
||||
/* Swap one element per iteration. */
$MainRemainLoop:
|
||||
LD $f10, 0*SIZE($19)
|
||||
LD $f20, 0*SIZE($17)
|
||||
/* Both pointers are advanced before the stores, hence the -1*SIZE store
   offsets below. */
lda $17, 1*SIZE($17)
|
||||
lda $19, 1*SIZE($19)
|
||||
subl $22, 1, $22
|
||||
ST $f10, -1*SIZE($17)
|
||||
ST $f20, -1*SIZE($19)
|
||||
bgt $22, $MainRemainLoop
|
||||
.align 4
|
||||
|
||||
/* Unit-stride exit: return 0 in $0. */
$MainEnd:
|
||||
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* Strided path, taken when either stride differs from 1.  $23 and $24 are
   copies of $17 and $19 used as store pointers in $SubLoop, while $17 and
   $19 themselves run ahead as load pointers.
   NOTE(review): SXADDQ comes from common.h; it is presumably a scaled add
   (stride * element size + pointer) - confirm against its definition. */
$Sub:
|
||||
mov $17, $23
|
||||
mov $19, $24
|
||||
|
||||
/* No full group of 8 ($21 <= 0): go straight to the leftover loop. */
ble $21, $SubRemain
|
||||
.align 4
|
||||
|
||||
/* Each pass swaps 8 elements. */
$SubLoop:
|
||||
/* Read 8 elements from the vector at $19 (stride $20) into $f10-$f17. */
LD $f10, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
LD $f11, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f12, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
LD $f13, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f14, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
LD $f15, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f16, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
LD $f17, 0*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
/* Read 8 elements from the vector at $17 (stride $18) into $f20-$f27. */
LD $f20, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
LD $f21, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f22, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
LD $f23, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f24, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
LD $f25, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f26, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
LD $f27, 0*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
/* Store the values read through $19 via $23, the trailing copy of $17. */
ST $f10, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
ST $f11, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f12, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
ST $f13, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f14, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
ST $f15, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f16, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
ST $f17, 0*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
/* Store the values read through $17 via $24, the trailing copy of $19. */
ST $f20, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
ST $f21, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f22, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
ST $f23, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f24, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
ST $f25, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f26, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
ST $f27, 0*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
subl $21, 1, $21
|
||||
bgt $21, $SubLoop
|
||||
.align 4
|
||||
|
||||
/* Leftover elements ($22 = count & 7); skip when there are none. */
$SubRemain:
|
||||
ble $22, $SubEnd
|
||||
.align 4
|
||||
|
||||
/* One element per iteration.  Loads and stores both go through $17 / $19
   here; $23 / $24 are not used after $SubLoop. */
$SubRemainLoop:
|
||||
LD $f10, 0*SIZE($19)
|
||||
LD $f20, 0*SIZE($17)
|
||||
|
||||
subl $22, 1, $22
|
||||
|
||||
ST $f10, 0*SIZE($17)
|
||||
ST $f20, 0*SIZE($19)
|
||||
|
||||
SXADDQ $18, $17, $17
|
||||
SXADDQ $20, $19, $19
|
||||
bgt $22, $SubRemainLoop
|
||||
.align 4
|
||||
|
||||
/* Exit shared by the strided path and the count <= 0 early-out: return 0. */
$SubEnd:
|
||||
clr $0
|
||||
ret
|
||||
EPILOGUE
|
||||
4068
kernel/alpha/trsm_kernel_4x4_LN.S
Normal file
4068
kernel/alpha/trsm_kernel_4x4_LN.S
Normal file
File diff suppressed because it is too large
Load Diff
4066
kernel/alpha/trsm_kernel_4x4_LT.S
Normal file
4066
kernel/alpha/trsm_kernel_4x4_LT.S
Normal file
File diff suppressed because it is too large
Load Diff
4066
kernel/alpha/trsm_kernel_4x4_RT.S
Normal file
4066
kernel/alpha/trsm_kernel_4x4_RT.S
Normal file
File diff suppressed because it is too large
Load Diff
301
kernel/alpha/zamax.S
Normal file
301
kernel/alpha/zamax.S
Normal file
@@ -0,0 +1,301 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Register aliases: N = element count, X = current element pointer,
   INCX = stride. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
|
||||
/* CMPLT(a, b) expands to cmptlt with the operands in the given order, or
   swapped when USE_MIN is defined.  Each use in this file is followed by
   an fcmovne that replaces the running value with the candidate, so the
   routine keeps the larger value by default and the smaller one under
   USE_MIN. */
#ifndef USE_MIN
|
||||
#define CMPLT(a, b) cmptlt a, b
|
||||
#else
|
||||
#define CMPLT(a, b) cmptlt b, a
|
||||
#endif
|
||||
|
||||
/* Stack bytes used to save $f2-$f9 (eight 8-byte slots). */
#define STACKSIZE 8 * 8
|
||||
|
||||
/*
 * Reduction over the vector at X.  Each element is a pair of values read at
 * offsets 0 and 1*SIZE; the result left in $f0 is the largest value of
 * |first| + |second| over the elements (the smallest under USE_MIN).
 * $f0 is 0 when N <= 0 or INCX <= 0.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, STACKSIZE, $26, 0
|
||||
|
||||
lda $sp, -STACKSIZE($sp)
|
||||
|
||||
/* Save $f2-$f9 on the stack (reloaded at $End).  Interleaved with the
   saves: clear $f16-$f19, compute $2 = (0 < N) and $3 = (0 < INCX), pass
   INCX through SXADDQ, and form $0 = $2 & $3.
   NOTE(review): SXADDQ comes from common.h; with a zero base it presumably
   converts INCX from elements to a byte offset - confirm. */
stt $f2, 0($sp)
|
||||
fclr $f16
|
||||
cmplt $31, N, $2
|
||||
|
||||
stt $f3, 8($sp)
|
||||
fclr $f17
|
||||
cmplt $31, INCX, $3
|
||||
unop
|
||||
|
||||
stt $f4, 16($sp)
|
||||
fclr $f18
|
||||
SXADDQ INCX, $31, INCX
|
||||
unop
|
||||
|
||||
stt $f5, 24($sp)
|
||||
fclr $f19
|
||||
and $2, $3, $0
|
||||
unop
|
||||
|
||||
stt $f6, 32($sp)
|
||||
unop
|
||||
|
||||
stt $f7, 40($sp)
|
||||
stt $f8, 48($sp)
|
||||
stt $f9, 56($sp)
|
||||
|
||||
/* Default result is 0; leave early when either test above failed. */
fclr $f0
|
||||
beq $0, $End # if (n <= 0) or (incx <= 0) return
|
||||
.align 4
|
||||
|
||||
/* Seed the result with the first element: $f0 = |first| + |second|.
   $1 = N >> 2 is the number of 4-element groups.  INCX is doubled because
   each element consists of two values. */
LD $f20, 0 * SIZE(X)
|
||||
LD $f21, 1 * SIZE(X)
|
||||
sra N, 2, $1
|
||||
addq INCX, INCX, INCX
|
||||
|
||||
fabs $f20, $f20
|
||||
fabs $f21, $f21
|
||||
addt $f20, $f21, $f0
|
||||
/* Fewer than 4 elements: go to the one-at-a-time loop.  X has not been
   advanced yet, so that loop reads the first element again. */
ble $1, $L15
|
||||
.align 4
|
||||
|
||||
/* First group: the first element is already in $f20/$f21; load the next
   three into $f22-$f27, advancing X after each pair.  $f1-$f3 are set to
   copies of $f0, giving four independent running results $f0-$f3. */
lda $1, -1($1)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
unop
|
||||
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fmov $f0, $f1
|
||||
LD $f23, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD $f24, 0 * SIZE(X)
|
||||
fmov $f0, $f2
|
||||
LD $f25, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD $f26, 0 * SIZE(X)
|
||||
fmov $f0, $f3
|
||||
LD $f27, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
/* Absolute values of the first group go to $f8-$f15. */
fabs $f20, $f8
|
||||
fabs $f21, $f9
|
||||
fabs $f22, $f10
|
||||
fabs $f23, $f11
|
||||
|
||||
fabs $f24, $f12
|
||||
fabs $f25, $f13
|
||||
fabs $f26, $f14
|
||||
fabs $f27, $f15
|
||||
|
||||
/* Only one group in total: finish it at $L14. */
ble $1, $L14
|
||||
.align 4
|
||||
|
||||
/* Load the second group into $f20-$f27 so that the loop at $L12 can
   overlap the arithmetic of one group with the loads of the next. */
LD $f20, 0 * SIZE(X)
|
||||
LD $f21, 1 * SIZE(X)
|
||||
lda $1, -1($1)
|
||||
addq X, INCX, X
|
||||
|
||||
LD $f22, 0 * SIZE(X)
|
||||
LD $f23, 1 * SIZE(X)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
|
||||
LD $f24, 0 * SIZE(X)
|
||||
LD $f25, 1 * SIZE(X)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
|
||||
LD $f26, 0 * SIZE(X)
|
||||
LD $f27, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
/* Exactly two groups: no further loads are needed, drain at $L13. */
ble $1, $L13
|
||||
.align 4
|
||||
|
||||
/* Main loop, one 4-element group per pass, in three overlapped stages:
     - $f16-$f19 = pair sums of the absolute values held in $f8-$f15;
     - $f8-$f15  = absolute values of the group already in $f20-$f27;
     - $f20-$f27 = next group loaded from X.
   Each sum is then compared with its running result ($f0-$f3) and moved in
   when the CMPLT result is non-zero. */
$L12:
|
||||
addt $f8, $f9, $f16
|
||||
unop
|
||||
fabs $f20, $f8
|
||||
/* Load into $31 from 64 elements ahead; the value is never read
   (NOTE(review): presumably a prefetch hint - confirm). */
ldl $31, 64 * SIZE(X)
|
||||
|
||||
addt $f10, $f11, $f17
|
||||
unop
|
||||
fabs $f21, $f9
|
||||
LD $f20, 0 * SIZE(X)
|
||||
|
||||
addt $f12, $f13, $f18
|
||||
LD $f21, 1 * SIZE(X)
|
||||
fabs $f22, $f10
|
||||
addq X, INCX, X
|
||||
|
||||
addt $f14, $f15, $f19
|
||||
LD $f22, 0 * SIZE(X)
|
||||
fabs $f23, $f11
|
||||
unop
|
||||
|
||||
CMPLT($f0, $f16), $f4
|
||||
LD $f23, 1 * SIZE(X)
|
||||
fabs $f24, $f12
|
||||
addq X, INCX, X
|
||||
|
||||
CMPLT($f1, $f17), $f5
|
||||
LD $f24, 0 * SIZE(X)
|
||||
fabs $f25, $f13
|
||||
unop
|
||||
|
||||
CMPLT($f2, $f18), $f6
|
||||
LD $f25, 1 * SIZE(X)
|
||||
fabs $f26, $f14
|
||||
addq X, INCX, X
|
||||
|
||||
CMPLT($f3, $f19), $f7
|
||||
LD $f26, 0 * SIZE(X)
|
||||
fabs $f27, $f15
|
||||
unop
|
||||
|
||||
fcmovne $f4, $f16, $f0
|
||||
LD $f27, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
lda $1, -1($1) # i --
|
||||
|
||||
fcmovne $f5, $f17, $f1
|
||||
fcmovne $f6, $f18, $f2
|
||||
fcmovne $f7, $f19, $f3
|
||||
bgt $1,$L12
|
||||
.align 4
|
||||
|
||||
/* Drain step: same arithmetic as one pass of $L12 but without loads.  It
   leaves the absolute values of the last loaded group in $f8-$f15 for
   $L14. */
$L13:
|
||||
addt $f8, $f9, $f16
|
||||
fabs $f20, $f8
|
||||
|
||||
addt $f10, $f11, $f17
|
||||
fabs $f21, $f9
|
||||
|
||||
addt $f12, $f13, $f18
|
||||
fabs $f22, $f10
|
||||
|
||||
addt $f14, $f15, $f19
|
||||
fabs $f23, $f11
|
||||
|
||||
CMPLT($f0, $f16), $f4
|
||||
fabs $f24, $f12
|
||||
|
||||
CMPLT($f1, $f17), $f5
|
||||
fabs $f25, $f13
|
||||
|
||||
CMPLT($f2, $f18), $f6
|
||||
fabs $f26, $f14
|
||||
CMPLT($f3, $f19), $f7
|
||||
fabs $f27, $f15
|
||||
|
||||
fcmovne $f4, $f16, $f0
|
||||
fcmovne $f5, $f17, $f1
|
||||
fcmovne $f6, $f18, $f2
|
||||
fcmovne $f7, $f19, $f3
|
||||
.align 4
|
||||
|
||||
/* Last group: sum and compare the values in $f8-$f15, then fold the four
   running results $f0-$f3 into $f0. */
$L14:
|
||||
addt $f8, $f9, $f16
|
||||
addt $f10, $f11, $f17
|
||||
addt $f12, $f13, $f18
|
||||
addt $f14, $f15, $f19
|
||||
|
||||
CMPLT($f0, $f16), $f4
|
||||
CMPLT($f1, $f17), $f5
|
||||
CMPLT($f2, $f18), $f6
|
||||
CMPLT($f3, $f19), $f7
|
||||
|
||||
fcmovne $f4, $f16, $f0
|
||||
fcmovne $f5, $f17, $f1
|
||||
fcmovne $f6, $f18, $f2
|
||||
fcmovne $f7, $f19, $f3
|
||||
|
||||
/* Pairwise reduction: ($f0,$f1) -> $f0, ($f2,$f3) -> $f2, then
   ($f0,$f2) -> $f0. */
CMPLT($f0, $f1), $f16
|
||||
CMPLT($f2, $f3), $f17
|
||||
|
||||
fcmovne $f16, $f1, $f0
|
||||
fcmovne $f17, $f3, $f2
|
||||
|
||||
CMPLT($f0, $f2), $f16
|
||||
fcmovne $f16, $f2, $f0
|
||||
.align 4
|
||||
|
||||
/* Leftover elements: $1 = N & 3; nothing to do when it is zero. */
$L15:
|
||||
and N, 3, $1
|
||||
unop
|
||||
unop
|
||||
ble $1, $End
|
||||
.align 4
|
||||
|
||||
/* One element per iteration: $f29 = |first| + |second|, moved into $f0
   when the CMPLT result is non-zero. */
$L16:
|
||||
LD $f20, 0 * SIZE(X)
|
||||
LD $f21, 1 * SIZE(X)
|
||||
unop
|
||||
addq X, INCX, X
|
||||
|
||||
fabs $f20, $f29
|
||||
fabs $f21, $f30
|
||||
addt $f29, $f30, $f29
|
||||
|
||||
CMPLT($f0, $f29), $f16
|
||||
fcmovne $f16, $f29, $f0
|
||||
|
||||
lda $1, -1($1) # i --
|
||||
bgt $1, $L16
|
||||
.align 4
|
||||
|
||||
/* Common exit: reload $f2-$f9 from the save area, release the STACKSIZE
   bytes of stack and return with the result in $f0. */
$End:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
|
||||
ldt $f6, 32($sp)
|
||||
ldt $f7, 40($sp)
|
||||
ldt $f8, 48($sp)
|
||||
ldt $f9, 56($sp)
|
||||
lda $sp, STACKSIZE($sp)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
208
kernel/alpha/zasum.S
Normal file
208
kernel/alpha/zasum.S
Normal file
@@ -0,0 +1,208 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Distance, in elements, of the look-ahead load issued in the main loop. */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Integer registers: N = element count, X = current element pointer,
   INCX = stride, I = loop counter. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
/* s0-s3: partial sums.  s0 ($f0) holds the final result. */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0-a7: values loaded from X (two per element, four elements). */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
/* t0-t3: absolute values waiting to be added to s0-s3. */
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
/*
 * Sums |first| + |second| over all elements of the vector at X, where each
 * element is a pair of values read at offsets 0 and 1*SIZE.  The result is
 * left in s0; it is 0 when N <= 0.
 * NOTE(review): ADD, LD and SXADDQ come from common.h; ADD/LD are presumably
 * precision-dependent and SXADDQ a scaled add - confirm.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Clear the sums and pending values.  INCX is doubled because each element
   consists of two values. */
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
addq INCX, INCX, INCX
|
||||
|
||||
fclr s1
|
||||
unop
|
||||
fclr t1
|
||||
/* N <= 0: $L999 only uses s0, s1, t0 and t1, all cleared above. */
ble N, $L999
|
||||
|
||||
/* I = N >> 2 groups of four elements; with none, go to the leftover
   handling at $L15. */
fclr s2
|
||||
sra N, 2, I
|
||||
fclr s3
|
||||
ble I, $L15
|
||||
|
||||
/* Load the first three elements (a0-a5) ahead of the loop; the fourth
   element of each group (a6/a7) is loaded inside $L12 / $L13. */
LD a0, 0 * SIZE(X)
|
||||
fclr t2
|
||||
LD a1, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
LD a5, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
lda I, -1(I)
|
||||
|
||||
/* Single group: skip the loop and drain at $L13. */
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/* Main loop, four elements (eight values) per pass.  Each ADD folds in the
   absolute value produced by the previous fabs into the same t register,
   and each fabs reads a value loaded earlier.  The loads here fetch a6/a7
   of the current group and a0-a5 of the next one. */
$L12:
|
||||
ADD s0, t0, s0
|
||||
/* Load into $31 from PREFETCHSIZE elements ahead; the value is never read
   (NOTE(review): presumably a prefetch hint - confirm). */
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||
fabs a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fabs a1, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 1 * SIZE(X)
|
||||
fabs a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fabs a3, t3
|
||||
unop
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 1 * SIZE(X)
|
||||
fabs a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fabs a5, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 1 * SIZE(X)
|
||||
fabs a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fabs a7, t3
|
||||
unop
|
||||
|
||||
LD a5, 1 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
/* Drain the last group: load its fourth element (a6/a7) and run the same
   ADD / fabs sequence without fetching a further group. */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fabs a0, t0
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 1 * SIZE(X)
|
||||
fabs a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fabs a2, t2
|
||||
ADD s3, t3, s3
|
||||
fabs a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fabs a4, t0
|
||||
ADD s1, t1, s1
|
||||
fabs a5, t1
|
||||
ADD s2, t2, s2
|
||||
fabs a6, t2
|
||||
ADD s3, t3, s3
|
||||
fabs a7, t3
|
||||
|
||||
/* Fold in the pending t2 / t3 now; t0 / t1 stay pending and are added
   later, at $L17 or $L999. */
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
.align 4
|
||||
|
||||
/* Fold s2 into s0 and s3 into s1, then handle the I = N & 3 leftover
   elements, if any. */
$L15:
|
||||
ADD s0, s2, s0
|
||||
and N, 3, I
|
||||
ADD s1, s3, s1
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
/* One element per iteration.  Each ADD adds the t value left pending by the
   previous step; the fabs results of this element stay pending in t0 / t1
   for the next iteration or for $L999. */
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
fabs a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a1, 1 * SIZE(X)
|
||||
fabs a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
/* Exit: add the pending t0 / t1, combine the two partial sums and return
   with the total in s0. */
$L999:
|
||||
ADD s0, t0, s0
|
||||
ADD s1, t1, s1
|
||||
|
||||
ADD s0, s1, s0
|
||||
ret
|
||||
EPILOGUE
|
||||
611
kernel/alpha/zaxpy.S
Normal file
611
kernel/alpha/zaxpy.S
Normal file
@@ -0,0 +1,611 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define PREFETCHSIZE 40
|
||||
|
||||
#ifndef CONJ
|
||||
#define ADD1 SUB
|
||||
#define ADD2 ADD
|
||||
#else
|
||||
#define ADD1 ADD
|
||||
#define ADD2 SUB
|
||||
#endif
|
||||
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 16, $26, 0
|
||||
|
||||
ldl $19, 0($sp)
|
||||
fmov $f19, $f29
|
||||
ldq $20, 8($sp)
|
||||
fmov $f20, $f30
|
||||
|
||||
mov $21, $18
|
||||
ldl $21, 16($sp)
|
||||
lda $sp, -64($sp)
|
||||
nop
|
||||
|
||||
stt $f2, 0($sp)
|
||||
cmpeq $19, 1, $1
|
||||
stt $f3, 8($sp)
|
||||
cmpeq $21, 1, $2
|
||||
|
||||
stt $f4, 16($sp)
|
||||
and $16, 3, $5
|
||||
stt $f5, 24($sp)
|
||||
stt $f6, 32($sp)
|
||||
|
||||
stt $f7, 40($sp)
|
||||
stt $f8, 48($sp)
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
and $1, $2, $1
|
||||
ble $16, $End
|
||||
sra $16, 2, $4
|
||||
beq $1, $Sub
|
||||
|
||||
ble $4, $Remain
|
||||
subq $4, 1, $4
|
||||
|
||||
LD $f0, 0*SIZE($18)
|
||||
LD $f1, 1*SIZE($18)
|
||||
LD $f2, 2*SIZE($18)
|
||||
LD $f3, 3*SIZE($18)
|
||||
LD $f4, 4*SIZE($18)
|
||||
LD $f5, 5*SIZE($18)
|
||||
LD $f6, 6*SIZE($18)
|
||||
LD $f7, 7*SIZE($18)
|
||||
|
||||
LD $f8, 0*SIZE($20)
|
||||
LD $f28, 1*SIZE($20)
|
||||
LD $f10, 2*SIZE($20)
|
||||
LD $f11, 3*SIZE($20)
|
||||
LD $f12, 4*SIZE($20)
|
||||
LD $f13, 5*SIZE($20)
|
||||
LD $f14, 6*SIZE($20)
|
||||
LD $f15, 7*SIZE($20)
|
||||
|
||||
addq $18, 8*SIZE, $18
|
||||
ble $4, $MainLoopEnd
|
||||
.align 4
|
||||
|
||||
$MainLoop:
|
||||
ldt $f31, PREFETCHSIZE * SIZE($20)
|
||||
ldl $31, PREFETCHSIZE * SIZE($18)
|
||||
|
||||
MUL $f29, $f0, $f20
|
||||
LD $f31, 9*SIZE($18)
|
||||
MUL $f30, $f1, $f21
|
||||
unop
|
||||
|
||||
MUL $f30, $f0, $f22
|
||||
LD $f0, 0*SIZE($18)
|
||||
MUL $f29, $f1, $f23
|
||||
LD $f1, 1*SIZE($18)
|
||||
|
||||
MUL $f29, $f2, $f24
|
||||
unop
|
||||
MUL $f30, $f3, $f25
|
||||
nop
|
||||
|
||||
MUL $f30, $f2, $f26
|
||||
LD $f2, 2*SIZE($18)
|
||||
MUL $f29, $f3, $f27
|
||||
LD $f3, 3*SIZE($18)
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
MUL $f29, $f4, $f20
|
||||
ADD2 $f22, $f23, $f17
|
||||
MUL $f30, $f5, $f21
|
||||
|
||||
ADD1 $f24, $f25, $f18
|
||||
unop
|
||||
MUL $f30, $f4, $f22
|
||||
LD $f4, 4*SIZE($18)
|
||||
|
||||
ADD2 $f26, $f27, $f19
|
||||
addq $20, 8*SIZE, $20
|
||||
MUL $f29, $f5, $f23
|
||||
LD $f5, 5*SIZE($18)
|
||||
|
||||
ADD $f16, $f8, $f16
|
||||
LD $f8, 0*SIZE($20)
|
||||
MUL $f29, $f6, $f24
|
||||
unop
|
||||
|
||||
ADD $f17, $f28, $f17
|
||||
LD $f28, 1*SIZE($20)
|
||||
MUL $f30, $f7, $f25
|
||||
unop
|
||||
|
||||
ADD $f18, $f10, $f18
|
||||
LD $f10, 2*SIZE($20)
|
||||
MUL $f30, $f6, $f26
|
||||
LD $f6, 6*SIZE($18)
|
||||
|
||||
ADD $f19, $f11, $f19
|
||||
LD $f11, 3*SIZE($20)
|
||||
MUL $f29, $f7, $f27
|
||||
LD $f7, 7*SIZE($18)
|
||||
|
||||
ST $f16,-8*SIZE($20)
|
||||
ADD1 $f20, $f21, $f16
|
||||
ST $f17,-7*SIZE($20)
|
||||
ADD2 $f22, $f23, $f17
|
||||
|
||||
ST $f18,-6*SIZE($20)
|
||||
ADD1 $f24, $f25, $f18
|
||||
ST $f19,-5*SIZE($20)
|
||||
ADD2 $f26, $f27, $f19
|
||||
|
||||
ADD $f16, $f12, $f16
|
||||
LD $f12, 4*SIZE($20)
|
||||
ADD $f17, $f13, $f17
|
||||
LD $f13, 5*SIZE($20)
|
||||
ADD $f18, $f14, $f18
|
||||
LD $f14, 6*SIZE($20)
|
||||
ADD $f19, $f15, $f19
|
||||
LD $f15, 7*SIZE($20)
|
||||
|
||||
ST $f16,-4*SIZE($20)
|
||||
addq $18, 8*SIZE, $18
|
||||
ST $f17,-3*SIZE($20)
|
||||
subq $4, 1, $4
|
||||
|
||||
ST $f18,-2*SIZE($20)
|
||||
nop
|
||||
ST $f19,-1*SIZE($20)
|
||||
bgt $4, $MainLoop
|
||||
.align 4
|
||||
|
||||
$MainLoopEnd:
|
||||
MUL $f29, $f0, $f20
|
||||
MUL $f30, $f1, $f21
|
||||
MUL $f30, $f0, $f22
|
||||
MUL $f29, $f1, $f23
|
||||
|
||||
MUL $f29, $f2, $f24
|
||||
MUL $f30, $f3, $f25
|
||||
MUL $f30, $f2, $f26
|
||||
MUL $f29, $f3, $f27
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
MUL $f29, $f4, $f20
|
||||
ADD2 $f22, $f23, $f17
|
||||
MUL $f30, $f5, $f21
|
||||
|
||||
ADD1 $f24, $f25, $f18
|
||||
MUL $f30, $f4, $f22
|
||||
ADD2 $f26, $f27, $f19
|
||||
MUL $f29, $f5, $f23
|
||||
|
||||
ADD $f16, $f8, $f16
|
||||
MUL $f29, $f6, $f24
|
||||
ADD $f17, $f28, $f17
|
||||
MUL $f30, $f7, $f25
|
||||
|
||||
ADD $f18, $f10, $f18
|
||||
MUL $f30, $f6, $f26
|
||||
ADD $f19, $f11, $f19
|
||||
MUL $f29, $f7, $f27
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
ADD1 $f20, $f21, $f16
|
||||
ST $f17, 1*SIZE($20)
|
||||
ADD2 $f22, $f23, $f17
|
||||
|
||||
ST $f18, 2*SIZE($20)
|
||||
ADD1 $f24, $f25, $f18
|
||||
ST $f19, 3*SIZE($20)
|
||||
ADD2 $f26, $f27, $f19
|
||||
|
||||
ADD $f16, $f12, $f16
|
||||
ADD $f17, $f13, $f17
|
||||
ADD $f18, $f14, $f18
|
||||
ADD $f19, $f15, $f19
|
||||
|
||||
ST $f16, 4*SIZE($20)
|
||||
ST $f17, 5*SIZE($20)
|
||||
ST $f18, 6*SIZE($20)
|
||||
ST $f19, 7*SIZE($20)
|
||||
|
||||
unop
|
||||
addq $20, 8*SIZE, $20
|
||||
unop
|
||||
ble $5, $End
|
||||
.align 4
|
||||
|
||||
$Remain:
|
||||
subq $5, 1, $6
|
||||
ble $5, $End
|
||||
LD $f0, 0*SIZE($18)
|
||||
LD $f1, 1*SIZE($18)
|
||||
|
||||
LD $f8, 0*SIZE($20)
|
||||
LD $f28, 1*SIZE($20)
|
||||
addq $18, 2*SIZE, $18
|
||||
ble $6, $RemainLoopEnd
|
||||
.align 4
|
||||
|
||||
$RemainLoop:
|
||||
MUL $f29, $f0, $f20
|
||||
subq $6, 1, $6
|
||||
MUL $f30, $f1, $f21
|
||||
addq $20, 2*SIZE, $20
|
||||
|
||||
MUL $f30, $f0, $f22
|
||||
LD $f0, 0*SIZE($18)
|
||||
MUL $f29, $f1, $f23
|
||||
LD $f1, 1*SIZE($18)
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
ADD2 $f22, $f23, $f17
|
||||
ADD $f16, $f8, $f16
|
||||
LD $f8, 0*SIZE($20)
|
||||
ADD $f17, $f28, $f17
|
||||
LD $f28, 1*SIZE($20)
|
||||
|
||||
ST $f16,-2*SIZE($20)
|
||||
addq $18, 2*SIZE, $18
|
||||
ST $f17,-1*SIZE($20)
|
||||
bgt $6, $RemainLoop
|
||||
.align 4
|
||||
|
||||
$RemainLoopEnd:
|
||||
MUL $f29, $f0, $f20
|
||||
MUL $f30, $f1, $f21
|
||||
MUL $f30, $f0, $f22
|
||||
MUL $f29, $f1, $f23
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
ADD2 $f22, $f23, $f17
|
||||
ADD $f16, $f8, $f16
|
||||
ADD $f17, $f28, $f17
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
nop
|
||||
ST $f17, 1*SIZE($20)
|
||||
nop
|
||||
.align 4
|
||||
|
||||
$End:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
ldt $f6, 32($sp)
|
||||
ldt $f7, 40($sp)
|
||||
ldt $f8, 48($sp)
|
||||
lda $sp, 64($sp)
|
||||
ret
|
||||
.align 4
|
||||
|
||||
$Sub:
|
||||
SXSUBL $16, SIZE, $22
|
||||
addq $22, $22, $22 # Complex
|
||||
.align 4
|
||||
|
||||
addq $19, $19, $19 # Complex
|
||||
addq $21, $21, $21 # Complex
|
||||
|
||||
ble $4, $SubRemain
|
||||
LD $f0, 0*SIZE($18)
|
||||
LD $f1, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
LD $f2, 0*SIZE($18)
|
||||
LD $f3, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
LD $f4, 0*SIZE($18)
|
||||
LD $f5, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
LD $f6, 0*SIZE($18)
|
||||
LD $f7, 1*SIZE($18)
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
LD $f8, 0*SIZE($20)
|
||||
LD $f28, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $24
|
||||
|
||||
LD $f10, 0*SIZE($24)
|
||||
LD $f11, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
LD $f12, 0*SIZE($24)
|
||||
LD $f13, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
LD $f14, 0*SIZE($24)
|
||||
LD $f15, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
subq $4, 1, $4
|
||||
ble $4, $SubMainLoopEnd
|
||||
.align 4
|
||||
|
||||
$SubMainLoop:
|
||||
MUL $f29, $f0, $f20
|
||||
unop
|
||||
MUL $f30, $f1, $f21
|
||||
unop
|
||||
|
||||
MUL $f30, $f0, $f22
|
||||
LD $f0, 0*SIZE($18)
|
||||
MUL $f29, $f1, $f23
|
||||
LD $f1, 1*SIZE($18)
|
||||
|
||||
MUL $f29, $f2, $f24
|
||||
SXADDQ $19, $18, $18
|
||||
MUL $f30, $f3, $f25
|
||||
unop
|
||||
|
||||
MUL $f30, $f2, $f26
|
||||
LD $f2, 0*SIZE($18)
|
||||
MUL $f29, $f3, $f27
|
||||
LD $f3, 1*SIZE($18)
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
SXADDQ $19, $18, $18
|
||||
MUL $f29, $f4, $f20
|
||||
unop
|
||||
|
||||
ADD2 $f22, $f23, $f17
|
||||
unop
|
||||
MUL $f30, $f5, $f21
|
||||
unop
|
||||
|
||||
ADD1 $f24, $f25, $f18
|
||||
unop
|
||||
MUL $f30, $f4, $f22
|
||||
LD $f4, 0*SIZE($18)
|
||||
|
||||
ADD2 $f26, $f27, $f19
|
||||
unop
|
||||
MUL $f29, $f5, $f23
|
||||
LD $f5, 1*SIZE($18)
|
||||
|
||||
ADD $f16, $f8, $f16
|
||||
LD $f8, 0*SIZE($24)
|
||||
MUL $f29, $f6, $f24
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
ADD $f17, $f28, $f17
|
||||
LD $f28, 1*SIZE($24)
|
||||
MUL $f30, $f7, $f25
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
ADD $f18, $f10, $f18
|
||||
LD $f10, 0*SIZE($24)
|
||||
MUL $f30, $f6, $f26
|
||||
LD $f6, 0*SIZE($18)
|
||||
|
||||
ADD $f19, $f11, $f19
|
||||
LD $f11, 1*SIZE($24)
|
||||
MUL $f29, $f7, $f27
|
||||
LD $f7, 1*SIZE($18)
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
SXADDQ $19, $18, $18
|
||||
ADD1 $f20, $f21, $f16
|
||||
unop
|
||||
|
||||
ST $f17, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
ADD2 $f22, $f23, $f17
|
||||
unop
|
||||
|
||||
ST $f18, 0*SIZE($20)
|
||||
SXADDQ $21, $24, $24
|
||||
ADD1 $f24, $f25, $f18
|
||||
unop
|
||||
|
||||
ST $f19, 1*SIZE($20)
|
||||
unop
|
||||
ADD2 $f26, $f27, $f19
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ADD $f16, $f12, $f16
|
||||
unop
|
||||
LD $f12, 0*SIZE($24)
|
||||
unop
|
||||
|
||||
ADD $f17, $f13, $f17
|
||||
unop
|
||||
LD $f13, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
ADD $f18, $f14, $f18
|
||||
subq $4, 1, $4
|
||||
LD $f14, 0*SIZE($24)
|
||||
unop
|
||||
|
||||
ADD $f19, $f15, $f19
|
||||
unop
|
||||
LD $f15, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
ST $f17, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
unop
|
||||
|
||||
ST $f18, 0*SIZE($20)
|
||||
ST $f19, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
bgt $4, $SubMainLoop
|
||||
.align 4
|
||||
|
||||
$SubMainLoopEnd:
|
||||
MUL $f29, $f0, $f20
|
||||
MUL $f30, $f1, $f21
|
||||
MUL $f30, $f0, $f22
|
||||
MUL $f29, $f1, $f23
|
||||
|
||||
MUL $f29, $f2, $f24
|
||||
MUL $f30, $f3, $f25
|
||||
MUL $f30, $f2, $f26
|
||||
MUL $f29, $f3, $f27
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
MUL $f29, $f4, $f20
|
||||
ADD2 $f22, $f23, $f17
|
||||
MUL $f30, $f5, $f21
|
||||
|
||||
ADD1 $f24, $f25, $f18
|
||||
MUL $f30, $f4, $f22
|
||||
ADD2 $f26, $f27, $f19
|
||||
MUL $f29, $f5, $f23
|
||||
|
||||
ADD $f16, $f8, $f16
|
||||
MUL $f29, $f6, $f24
|
||||
ADD $f17, $f28, $f17
|
||||
MUL $f30, $f7, $f25
|
||||
|
||||
ADD $f18, $f10, $f18
|
||||
MUL $f30, $f6, $f26
|
||||
ADD $f19, $f11, $f19
|
||||
MUL $f29, $f7, $f27
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
ADD1 $f20, $f21, $f16
|
||||
ST $f17, 1*SIZE($20)
|
||||
ADD2 $f22, $f23, $f17
|
||||
|
||||
SXADDQ $21, $20, $20
|
||||
nop
|
||||
ST $f18, 0*SIZE($20)
|
||||
ADD1 $f24, $f25, $f18
|
||||
|
||||
ST $f19, 1*SIZE($20)
|
||||
ADD2 $f26, $f27, $f19
|
||||
SXADDQ $21, $20, $20
|
||||
ADD $f16, $f12, $f16
|
||||
|
||||
ADD $f17, $f13, $f17
|
||||
ADD $f18, $f14, $f18
|
||||
ADD $f19, $f15, $f19
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
ST $f17, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
|
||||
ST $f18, 0*SIZE($20)
|
||||
ST $f19, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
ble $5, $SubEnd
|
||||
.align 4
|
||||
|
||||
$SubRemain:
|
||||
subq $5, 1, $6
|
||||
ble $5, $SubEnd
|
||||
LD $f0, 0*SIZE($18)
|
||||
LD $f1, 1*SIZE($18)
|
||||
|
||||
LD $f8, 0*SIZE($20)
|
||||
LD $f28, 1*SIZE($20)
|
||||
SXADDQ $19, $18, $18
|
||||
SXADDQ $21, $20, $24
|
||||
ble $6, $SubRemainLoopEnd
|
||||
.align 4
|
||||
|
||||
$SubRemainLoop:
|
||||
MUL $f29, $f0, $f20
|
||||
MUL $f30, $f1, $f21
|
||||
MUL $f30, $f0, $f22
|
||||
LD $f0, 0*SIZE($18)
|
||||
|
||||
MUL $f29, $f1, $f23
|
||||
LD $f1, 1*SIZE($18)
|
||||
ADD1 $f20, $f21, $f16
|
||||
SXADDQ $19, $18, $18
|
||||
|
||||
ADD2 $f22, $f23, $f17
|
||||
nop
|
||||
ADD $f16, $f8, $f16
|
||||
LD $f8, 0*SIZE($24)
|
||||
|
||||
ADD $f17, $f28, $f17
|
||||
LD $f28, 1*SIZE($24)
|
||||
SXADDQ $21, $24, $24
|
||||
subq $6, 1, $6
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
ST $f17, 1*SIZE($20)
|
||||
SXADDQ $21, $20, $20
|
||||
bgt $6, $SubRemainLoop
|
||||
.align 4
|
||||
|
||||
$SubRemainLoopEnd:
|
||||
MUL $f29, $f0, $f20
|
||||
MUL $f30, $f1, $f21
|
||||
MUL $f30, $f0, $f22
|
||||
MUL $f29, $f1, $f23
|
||||
|
||||
ADD1 $f20, $f21, $f16
|
||||
ADD2 $f22, $f23, $f17
|
||||
ADD $f16, $f8, $f16
|
||||
ADD $f17, $f28, $f17
|
||||
|
||||
ST $f16, 0*SIZE($20)
|
||||
nop
|
||||
ST $f17, 1*SIZE($20)
|
||||
nop
|
||||
.align 4
|
||||
|
||||
$SubEnd:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
ldt $f6, 32($sp)
|
||||
ldt $f7, 40($sp)
|
||||
ldt $f8, 48($sp)
|
||||
lda $sp, 64($sp)
|
||||
ret
|
||||
EPILOGUE
|
||||
500
kernel/alpha/zdot.S
Normal file
500
kernel/alpha/zdot.S
Normal file
@@ -0,0 +1,500 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Register and tuning aliases for this kernel. */
/* PREFETCHSIZE: offset (in units of SIZE) used by the discarded loads in the main loop. */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Inputs as the code below uses them: N is the element count, X and Y are */
/* the operand base pointers, INCX and INCY their strides (doubled on entry). */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define Y $19
|
||||
#define INCY $20
|
||||
/* XX / YY: scratch copies of X / Y taken inside the one-element remainder loop. */
#define XX $21
|
||||
#define YY $23
|
||||
|
||||
/* I: loop counter (unrolled-group count, then remainder count). */
#define I $5
|
||||
|
||||
/* s0..s3: the four running partial sums; the final result is combined into s0 and s1. */
/* s2 aliases $f2, which the routine saves on entry and restores on exit. */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f2
|
||||
#define s3 $f30
|
||||
|
||||
/* a0..a7: values loaded from X; even names come from offset 0, odd names from offset 1 of an element. */
#define a0 $f10
|
||||
#define a1 $f11
|
||||
#define a2 $f12
|
||||
#define a3 $f13
|
||||
#define a4 $f14
|
||||
#define a5 $f15
|
||||
#define a6 $f16
|
||||
#define a7 $f17
|
||||
|
||||
/* b0..b7: values loaded from Y, laid out the same way as a0..a7. */
#define b0 $f18
|
||||
#define b1 $f19
|
||||
#define b2 $f20
|
||||
#define b3 $f21
|
||||
#define b4 $f22
|
||||
#define b5 $f23
|
||||
#define b6 $f24
|
||||
#define b7 $f25
|
||||
|
||||
/* t0..t3: products waiting to be added into s0..s3 by the following ADD. */
#define t0 $f26
|
||||
#define t1 $f27
|
||||
#define t2 $f28
|
||||
#define t3 $f29
|
||||
|
||||
/* Entry: set up a 16-byte frame, save $f2 (used as s2), clear the partial */
/* sums and pending products, double both strides, then preload the first */
/* operands for the 8-element unrolled loop. */
/* N <= 0 branches to $L999 with s0 and s1 already cleared. */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 16, $26, 0
|
||||
|
||||
/* Allocate the frame and spill $f2 while clearing the first sums. */
lda $sp, -16($sp)
|
||||
fclr s0
|
||||
stt $f2, 0($sp)
|
||||
fclr s1
|
||||
|
||||
/* INCX is doubled: each element is read as two values (offsets 0 and 1). */
fclr s2
|
||||
addq INCX, INCX, INCX
|
||||
fclr s3
|
||||
ble N, $L999
|
||||
|
||||
/* t0..t3 start at zero so the first ADD of each sum adds nothing. */
addq INCY, INCY, INCY
|
||||
fclr t0
|
||||
|
||||
fclr t1
|
||||
fclr t2
|
||||
|
||||
fclr t3
|
||||
|
||||
/* I = N / 8 unrolled groups; none -> go straight to the remainder at $L25. */
srl N, 3, I
|
||||
ble I, $L25
|
||||
|
||||
/* Preload element 0 of X and Y. */
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD b0, 0 * SIZE(Y)
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
/* SXADDQ is defined outside this file; here it steps X / Y by the doubled stride. */
SXADDQ INCX, X, X
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
/* Preload element 1. */
LD a2, 0 * SIZE(X)
|
||||
LD a3, 1 * SIZE(X)
|
||||
LD b2, 0 * SIZE(Y)
|
||||
LD b3, 1 * SIZE(Y)
|
||||
|
||||
SXADDQ INCX, X, X
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
/* Preload element 2. */
LD a4, 0 * SIZE(X)
|
||||
LD a5, 1 * SIZE(X)
|
||||
LD b4, 0 * SIZE(Y)
|
||||
LD b5, 1 * SIZE(Y)
|
||||
|
||||
SXADDQ INCX, X, X
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
/* Preload only the first half of element 3; a7 / b7 are loaded at the top of $L22 / $L23. */
LD a6, 0 * SIZE(X)
|
||||
LD b6, 0 * SIZE(Y)
|
||||
|
||||
/* One group is always finished by $L23, so $L22 runs I - 1 times. */
subq I, 1, I
|
||||
ble I, $L23
|
||||
.align 4
|
||||
|
||||
/* Main loop: eight elements per iteration, software pipelined. */
/* Each ADD folds the product from the previous MUL of the same sum into */
/* s0..s3, while the interleaved loads fetch operands for later elements. */
/* Per element: s0 += a_even*b_even, s1 += a_even*b_odd, */
/*              s2 += a_odd*b_even,  s3 += a_odd*b_odd. */
/* Instruction order matters: every register is reloaded only after its last use. */
$L22:
|
||||
/* Element 0 of the group (a0,a1 x b0,b1); also completes the element-3 preload (a7, b7). */
ADD s0, t0, s0
|
||||
LD a7, 1 * SIZE(X)
|
||||
MUL a0, b0, t0
|
||||
LD b7, 1 * SIZE(Y)
|
||||
|
||||
/* ldl into $31 discards the loaded value; used here as a prefetch of X and Y. */
ADD s1, t1, s1
|
||||
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||
MUL a0, b1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
ldl $31, PREFETCHSIZE * SIZE(Y)
|
||||
MUL a1, b0, t2
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
MUL a1, b1, t3
|
||||
LD a1, 1 * SIZE(X)
|
||||
|
||||
/* Element 1 (a2,a3 x b2,b3). */
ADD s0, t0, s0
|
||||
LD b0, 0 * SIZE(Y)
|
||||
MUL a2, b2, t0
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a2, b3, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a3, b2, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, 0 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
|
||||
/* Element 2 (a4,a5 x b4,b5). */
ADD s0, t0, s0
|
||||
LD b2, 0 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a4, b5, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a5, b4, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
MUL a5, b5, t3
|
||||
LD a5, 1 * SIZE(X)
|
||||
|
||||
/* Element 3 (a6,a7 x b6,b7). */
ADD s0, t0, s0
|
||||
LD b4, 0 * SIZE(Y)
|
||||
MUL a6, b6, t0
|
||||
LD b5, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a6, b7, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a7, b6, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, 0 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, 1 * SIZE(X)
|
||||
|
||||
/* Element 4 (a0,a1 x b0,b1, reloaded above). */
ADD s0, t0, s0
|
||||
LD b6, 0 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a0, b1, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a1, b0, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
MUL a1, b1, t3
|
||||
LD a1, 1 * SIZE(X)
|
||||
|
||||
/* Element 5 (a2,a3 x b2,b3). */
ADD s0, t0, s0
|
||||
LD b0, 0 * SIZE(Y)
|
||||
MUL a2, b2, t0
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a2, b3, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a3, b2, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, 0 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
|
||||
/* Element 6 (a4,a5 x b4,b5); the group counter is decremented here. */
ADD s0, t0, s0
|
||||
LD b2, 0 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a4, b5, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a5, b4, t2
|
||||
subq I, 1, I
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
MUL a5, b5, t3
|
||||
LD a5, 1 * SIZE(X)
|
||||
|
||||
/* Element 7 (a6,a7 x b6,b7); a6 / b6 are reloaded for the next group, a7 / b7 are not yet. */
ADD s0, t0, s0
|
||||
LD b4, 0 * SIZE(Y)
|
||||
MUL a6, b6, t0
|
||||
LD b5, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a6, b7, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a6, 0 * SIZE(X)
|
||||
MUL a7, b6, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD b6, 0 * SIZE(Y)
|
||||
MUL a7, b7, t3
|
||||
bgt I, $L22
|
||||
.align 4
|
||||
|
||||
/* Drain of the unrolled path: handles the last group of eight elements. */
/* Same accumulate/multiply pattern as $L22, but loads stop once this */
/* group's operands are in registers, so nothing past the group is read. */
/* On exit t0..t3 still hold the last four products; they are added later. */
$L23:
|
||||
/* Element 0 of the group; also loads the second half of element 3 (a7, b7). */
ADD s0, t0, s0
|
||||
LD a7, 1 * SIZE(X)
|
||||
MUL a0, b0, t0
|
||||
LD b7, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a0, b1, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a1, b0, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
MUL a1, b1, t3
|
||||
LD a1, 1 * SIZE(X)
|
||||
|
||||
/* Element 1. */
ADD s0, t0, s0
|
||||
LD b0, 0 * SIZE(Y)
|
||||
MUL a2, b2, t0
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a2, b3, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a3, b2, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a2, 0 * SIZE(X)
|
||||
MUL a3, b3, t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
|
||||
/* Element 2. */
ADD s0, t0, s0
|
||||
LD b2, 0 * SIZE(Y)
|
||||
MUL a4, b4, t0
|
||||
LD b3, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a4, b5, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a5, b4, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
MUL a5, b5, t3
|
||||
LD a5, 1 * SIZE(X)
|
||||
|
||||
/* Element 3. */
ADD s0, t0, s0
|
||||
LD b4, 0 * SIZE(Y)
|
||||
MUL a6, b6, t0
|
||||
LD b5, 1 * SIZE(Y)
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a6, b7, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
ADD s2, t2, s2
|
||||
unop
|
||||
MUL a7, b6, t2
|
||||
unop
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a6, 0 * SIZE(X)
|
||||
MUL a7, b7, t3
|
||||
LD a7, 1 * SIZE(X)
|
||||
|
||||
/* Element 4; the last loads of the group (b6, b7) happen here. */
ADD s0, t0, s0
|
||||
LD b6, 0 * SIZE(Y)
|
||||
MUL a0, b0, t0
|
||||
LD b7, 1 * SIZE(Y)
|
||||
|
||||
/* Final pointer step: X / Y now address the first element after this group. */
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a0, b1, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
/* From here on only arithmetic remains: elements 4..7 are already in registers. */
ADD s2, t2, s2
|
||||
MUL a1, b0, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a1, b1, t3
|
||||
|
||||
/* Element 5. */
ADD s0, t0, s0
|
||||
MUL a2, b2, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a2, b3, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a3, b2, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a3, b3, t3
|
||||
|
||||
/* Element 6. */
ADD s0, t0, s0
|
||||
MUL a4, b4, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a4, b5, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a5, b4, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a5, b5, t3
|
||||
|
||||
/* Element 7. */
ADD s0, t0, s0
|
||||
MUL a6, b6, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a6, b7, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a7, b6, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a7, b7, t3
|
||||
.align 4
|
||||
|
||||
/* Remainder setup: I = N mod 8 elements are left after the unrolled path. */
/* With none left, go to $L998; otherwise preload one element and step the pointers. */
$L25:
|
||||
and N, 7, I
|
||||
unop
|
||||
unop
|
||||
ble I, $L998
|
||||
|
||||
/* Preload the first remainder element. */
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
LD b0, 0 * SIZE(Y)
|
||||
LD b1, 1 * SIZE(Y)
|
||||
|
||||
/* If that was the only remainder element, skip the loop and multiply it at $L28. */
SXADDQ INCX, X, X
|
||||
subq I, 1, I
|
||||
SXADDQ INCY, Y, Y
|
||||
ble I, $L28
|
||||
.align 4
|
||||
|
||||
/* Remainder loop: one element per iteration. */
/* The element already in a0,a1 / b0,b1 is multiplied while the next one is */
/* loaded through XX / YY, which keep the pre-increment values of X / Y. */
$L26:
|
||||
ADD s0, t0, s0
|
||||
mov X, XX
|
||||
MUL a0, b0, t0
|
||||
mov Y, YY
|
||||
|
||||
ADD s1, t1, s1
|
||||
SXADDQ INCX, X, X
|
||||
MUL a0, b1, t1
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
/* a0 / b0 are overwritten only after their last use in this iteration. */
ADD s2, t2, s2
|
||||
LD a0, 0 * SIZE(XX)
|
||||
MUL a1, b0, t2
|
||||
LD b0, 0 * SIZE(YY)
|
||||
|
||||
ADD s3, t3, s3
|
||||
subq I, 1, I
|
||||
MUL a1, b1, t3
|
||||
LD a1, 1 * SIZE(XX)
|
||||
|
||||
LD b1, 1 * SIZE(YY)
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
/* Last remainder element: add the pending products, then form the four */
/* products of the element held in a0,a1 / b0,b1. They stay in t0..t3 */
/* and are added by the code that follows. */
$L28:
|
||||
ADD s0, t0, s0
|
||||
MUL a0, b0, t0
|
||||
ADD s1, t1, s1
|
||||
MUL a0, b1, t1
|
||||
|
||||
ADD s2, t2, s2
|
||||
MUL a1, b0, t2
|
||||
ADD s3, t3, s3
|
||||
MUL a1, b1, t3
|
||||
.align 4
|
||||
|
||||
/* Final reduction: add the last pending products, then combine the four */
/* partial sums into the two result values s0 and s1. */
$L998:
|
||||
ADD s0, t0, s0
|
||||
ADD s1, t1, s1
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
/* Without CONJ: s0 = s0 - s3, s1 = s1 + s2. */
/* With CONJ:    s0 = s0 + s3, s1 = s1 - s2. */
#ifndef CONJ
|
||||
SUB s0, s3, s0
|
||||
ADD s1, s2, s1
|
||||
#else
|
||||
ADD s0, s3, s0
|
||||
SUB s1, s2, s1
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
/* Common exit: restore $f2 (saved on entry), release the 16-byte frame and return. */
/* s0 / s1 ($f0 / $f1) hold the result; both are zero on the N <= 0 path. */
$L999:
|
||||
ldt $f2, 0($sp)
|
||||
lda $sp, 16($sp)
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
192
kernel/alpha/zgemm_beta.S
Normal file
192
kernel/alpha/zgemm_beta.S
Normal file
@@ -0,0 +1,192 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Entry of the scaling routine. As used below: $16 = m, $17 = n, */
/* $18 = c and $19 = ldc (both read from the stack), and $f19 / $f20 are */
/* the two parts of the scale factor. Each stored element is */
/* ($f19*c0 - $f20*c1, $f19*c1 + $f20*c0). */
/* NOTE(review): stack offsets 24 and 32 depend on the caller's argument layout - confirm. */
.set noat
|
||||
.set noreorder
|
||||
.text
|
||||
.align 5
|
||||
.globl CNAME
|
||||
.ent CNAME
|
||||
CNAME:
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
/* Profiling builds call _mcount before the body. */
#ifdef PROFILE
|
||||
ldgp $gp, 0($27)
|
||||
lda $28, _mcount
|
||||
jsr $28, ($28), _mcount
|
||||
.prologue 1
|
||||
#else
|
||||
.prologue 0
|
||||
#endif
|
||||
|
||||
/* Load c and ldc; return through $End when m <= 0 or n <= 0. */
ldq $18, 24($sp)
|
||||
ble $16, $End
|
||||
ldl $19, 32($sp)
|
||||
ble $17, $End
|
||||
|
||||
/* ldc is doubled (two values per element). If either factor part is non-zero, */
/* scale at $Main; otherwise fall through to the zero-fill loop. */
addq $19, $19, $19
|
||||
fbne $f19,$Main
|
||||
fbne $f20,$Main
|
||||
.align 4
|
||||
|
||||
/* Zero-fill path, taken when both $f19 and $f20 are zero. */
/* Outer loop ($L13): one pass per column, n passes in total. */
$L13:
|
||||
mov $18, $1
|
||||
lda $17, -1($17)
|
||||
SXADDQ $19, $18, $18
|
||||
mov $16, $2
|
||||
.align 4
|
||||
|
||||
/* Inner loop ($L12): store $f31 (reads as zero) to both values of m consecutive elements. */
$L12:
|
||||
ST $f31, 0*SIZE($1)
|
||||
ST $f31, 1*SIZE($1)
|
||||
lda $2, -1($2)
|
||||
lda $1, 2*SIZE($1)
|
||||
bgt $2, $L12
|
||||
bgt $17,$L13
|
||||
/* All columns cleared: return 0 in $0. */
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* Main Routine */
/* Per-column setup for the scaling path. $2 counts pairs of elements; */
/* a column with no full pair (m == 1) goes directly to $L18. */
|
||||
$Main:
|
||||
sra $16, 1, $2 # $2 = (m >> 1)
|
||||
mov $18, $1 # c_offset = c
|
||||
lda $17, -1($17) # n --
|
||||
SXADDQ $19, $18, $18 # c += ldc
|
||||
beq $2, $L18
|
||||
|
||||
/* Preload the first pair of elements (four values). */
LD $f14, 0*SIZE($1)
|
||||
LD $f15, 1*SIZE($1)
|
||||
LD $f24, 2*SIZE($1)
|
||||
LD $f25, 3*SIZE($1)
|
||||
/* The last pair is always finished at $L19, so the loop runs (m >> 1) - 1 times. */
lda $2, -1($2) # $2 --
|
||||
ble $2, $L19
|
||||
.align 4
|
||||
|
||||
|
||||
/* Pair loop: scales the two elements already in $f14/$f15 and $f24/$f25 */
/* while loading the next pair from offsets 4..7. Each register is reloaded */
/* only after its last MUL, so the instruction order must be kept. */
$L23:
|
||||
/* lds into $f31 discards the value; used here to touch data ahead of the loop. */
MUL $f19, $f14, $f10
|
||||
lds $f31, 9*SIZE($1)
|
||||
MUL $f20, $f15, $f11
|
||||
lda $2, -1($2)
|
||||
|
||||
MUL $f19, $f15, $f12
|
||||
LD $f15, 5*SIZE($1)
|
||||
MUL $f20, $f14, $f13
|
||||
LD $f14, 4*SIZE($1)
|
||||
|
||||
MUL $f19, $f24, $f16
|
||||
unop
|
||||
MUL $f20, $f25, $f17
|
||||
unop
|
||||
|
||||
/* First element result: $f22 = $f19*c0 - $f20*c1, $f23 = $f19*c1 + $f20*c0. */
MUL $f19, $f25, $f18
|
||||
LD $f25, 7*SIZE($1)
|
||||
SUB $f10, $f11, $f22
|
||||
unop
|
||||
|
||||
/* $1 advances by one pair here, so the stores below use negative offsets. */
MUL $f20, $f24, $f21
|
||||
LD $f24, 6*SIZE($1)
|
||||
ADD $f12, $f13, $f23
|
||||
lda $1, 4*SIZE($1)
|
||||
|
||||
SUB $f16, $f17, $f26
|
||||
ADD $f18, $f21, $f27
|
||||
ST $f22,-4*SIZE($1)
|
||||
ST $f23,-3*SIZE($1)
|
||||
|
||||
ST $f26,-2*SIZE($1)
|
||||
ST $f27,-1*SIZE($1)
|
||||
unop
|
||||
bgt $2,$L23
|
||||
.align 4
|
||||
|
||||
/* Last pair of the column: same arithmetic as the pair loop, with no further loads. */
$L19:
|
||||
MUL $f19, $f14, $f10
|
||||
MUL $f20, $f15, $f11
|
||||
MUL $f19, $f15, $f12
|
||||
MUL $f20, $f14, $f13
|
||||
|
||||
MUL $f19, $f24, $f16
|
||||
MUL $f20, $f25, $f17
|
||||
MUL $f19, $f25, $f18
|
||||
MUL $f20, $f24, $f21
|
||||
|
||||
SUB $f10, $f11, $f22
|
||||
ADD $f12, $f13, $f23
|
||||
SUB $f16, $f17, $f26
|
||||
ADD $f18, $f21, $f27
|
||||
lda $1, 4*SIZE($1)
|
||||
|
||||
ST $f22, -4*SIZE($1)
|
||||
ST $f23, -3*SIZE($1)
|
||||
ST $f26, -2*SIZE($1)
|
||||
ST $f27, -1*SIZE($1)
|
||||
|
||||
/* Odd m (low bit of $16 set): one element remains, handled at $L18. */
/* Otherwise start the next column while n > 0, else return 0. */
blbs $16, $L18
|
||||
bgt $17, $Main
|
||||
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* Single trailing element of a column (m odd, including m == 1). */
/* Computes ($f19*c0 - $f20*c1, $f19*c1 + $f20*c0) and stores it in place. */
$L18:
|
||||
LD $f14, 0*SIZE($1)
|
||||
LD $f15, 1*SIZE($1)
|
||||
MUL $f19, $f15, $f13
|
||||
MUL $f20, $f14, $f10
|
||||
|
||||
MUL $f19, $f14, $f12
|
||||
MUL $f20, $f15, $f11
|
||||
ADD $f13, $f10, $f26
|
||||
SUB $f12, $f11, $f27
|
||||
|
||||
/* The second value ($f26) is stored before the first ($f27). */
ST $f26, 1*SIZE($1)
|
||||
ST $f27, 0*SIZE($1)
|
||||
lda $1, 2*SIZE($1)
|
||||
/* Next column while n > 0; otherwise fall through to $End. */
bgt $17, $Main
|
||||
.align 4
|
||||
|
||||
/* Common exit: return 0 in $0. Also reached directly when m <= 0 or n <= 0. */
$End:
|
||||
clr $0
|
||||
ret
|
||||
.ident VERSION
|
||||
.end CNAME
|
||||
1712
kernel/alpha/zgemm_kernel_2x2.S
Normal file
1712
kernel/alpha/zgemm_kernel_2x2.S
Normal file
File diff suppressed because it is too large
Load Diff
1027
kernel/alpha/zgemv_n.S
Normal file
1027
kernel/alpha/zgemv_n.S
Normal file
File diff suppressed because it is too large
Load Diff
922
kernel/alpha/zgemv_t.S
Normal file
922
kernel/alpha/zgemv_t.S
Normal file
@@ -0,0 +1,922 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
#define STACKSIZE 64
|
||||
#define PREFETCHSIZE 32
|
||||
|
||||
#define M $16
|
||||
#define N $17
|
||||
#define A $21
|
||||
#define LDA $18
|
||||
|
||||
#define X $19
|
||||
#define INCX $20
|
||||
#define Y $22
|
||||
#define INCY $23
|
||||
|
||||
#define BUFFER $24
|
||||
|
||||
#define I $25
|
||||
#define J $27
|
||||
|
||||
#define X1 $3
|
||||
#define Y1 $4
|
||||
#define A1 $5
|
||||
#define A2 $6
|
||||
|
||||
#define alpha_r $f19
|
||||
#define alpha_i $f20
|
||||
|
||||
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
#define t0 $f12
|
||||
#define t1 $f13
|
||||
#define t2 $f14
|
||||
#define t3 $f15
|
||||
|
||||
#define x0 $f16
|
||||
#define x1 $f17
|
||||
#define x2 $f18
|
||||
#define x3 $f21
|
||||
|
||||
#define a0 $f22
|
||||
#define a1 $f23
|
||||
#define a2 $f24
|
||||
#define a3 $f25
|
||||
#define a4 $f26
|
||||
#define a5 $f27
|
||||
#define a6 $f28
|
||||
#define a7 $f29
|
||||
|
||||
#define a8 $f2
|
||||
#define a9 $f3
|
||||
#define a10 $f4
|
||||
#define a11 $f5
|
||||
#define a12 $f6
|
||||
#define a13 $f7
|
||||
#define a14 $f8
|
||||
#define a15 $f9
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define ADD1 ADD
|
||||
#define ADD2 ADD
|
||||
#define ADD3 SUB
|
||||
#define ADD4 ADD
|
||||
#elif !defined(CONJ) && defined(XCONJ)
|
||||
#define ADD1 ADD
|
||||
#define ADD2 ADD
|
||||
#define ADD3 ADD
|
||||
#define ADD4 SUB
|
||||
#elif defined(CONJ) && !defined(XCONJ)
|
||||
#define ADD1 ADD
|
||||
#define ADD2 SUB
|
||||
#define ADD3 ADD
|
||||
#define ADD4 ADD
|
||||
#else
|
||||
#define ADD1 ADD
|
||||
#define ADD2 SUB
|
||||
#define ADD3 SUB
|
||||
#define ADD4 SUB
|
||||
#endif
|
||||
|
||||
PROLOGUE
|
||||
|
||||
lda $sp, -STACKSIZE($sp)
|
||||
ldq LDA, 0 + STACKSIZE($sp)
|
||||
ldq X, 8 + STACKSIZE($sp)
|
||||
ldq INCX, 16 + STACKSIZE($sp)
|
||||
ldq Y, 24 + STACKSIZE($sp)
|
||||
ldq INCY, 32 + STACKSIZE($sp)
|
||||
ldq BUFFER, 40 + STACKSIZE($sp)
|
||||
|
||||
stt $f2, 0($sp)
|
||||
stt $f3, 8($sp)
|
||||
stt $f4, 16($sp)
|
||||
stt $f5, 24($sp)
|
||||
stt $f6, 32($sp)
|
||||
stt $f7, 40($sp)
|
||||
stt $f8, 48($sp)
|
||||
stt $f9, 56($sp)
|
||||
|
||||
PROFCODE
|
||||
|
||||
cmple M, 0, $0
|
||||
sll INCX, ZBASE_SHIFT, INCX
|
||||
cmple N, 0, $1
|
||||
sll INCY, ZBASE_SHIFT, INCY
|
||||
|
||||
or $0, $1, $0
|
||||
bne $0, $L999
|
||||
|
||||
cmpeq INCX, 2 * SIZE, $0
|
||||
mov X, X1
|
||||
sll LDA, ZBASE_SHIFT,LDA
|
||||
bne $0, $L10
|
||||
|
||||
sra M, 2, I
|
||||
mov BUFFER, Y1
|
||||
mov BUFFER, X
|
||||
ble I, $L05
|
||||
.align 4
|
||||
|
||||
$L02:
|
||||
ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
|
||||
lda I, -1(I)
|
||||
|
||||
LD a0, 0 * SIZE(X1)
|
||||
LD a1, 1 * SIZE(X1)
|
||||
addq X1, INCX, X1
|
||||
LD a2, 0 * SIZE(X1)
|
||||
LD a3, 1 * SIZE(X1)
|
||||
addq X1, INCX, X1
|
||||
|
||||
ST a0, 0 * SIZE(Y1)
|
||||
ST a1, 1 * SIZE(Y1)
|
||||
ST a2, 2 * SIZE(Y1)
|
||||
ST a3, 3 * SIZE(Y1)
|
||||
|
||||
LD a4, 0 * SIZE(X1)
|
||||
LD a5, 1 * SIZE(X1)
|
||||
addq X1, INCX, X1
|
||||
LD a6, 0 * SIZE(X1)
|
||||
LD a7, 1 * SIZE(X1)
|
||||
addq X1, INCX, X1
|
||||
|
||||
ST a4, 4 * SIZE(Y1)
|
||||
ST a5, 5 * SIZE(Y1)
|
||||
ST a6, 6 * SIZE(Y1)
|
||||
ST a7, 7 * SIZE(Y1)
|
||||
|
||||
lda Y1, 8 * SIZE(Y1)
|
||||
bgt I, $L02
|
||||
.align 4
|
||||
|
||||
$L05:
|
||||
and M, 3, I
|
||||
ble I, $L10
|
||||
.align 4
|
||||
|
||||
$L06:
|
||||
LD a0, 0 * SIZE(X1)
|
||||
LD a1, 1 * SIZE(X1)
|
||||
addq X1, INCX, X1
|
||||
|
||||
ST a0, 0 * SIZE(Y1)
|
||||
ST a1, 1 * SIZE(Y1)
|
||||
lda Y1, 2 * SIZE(Y1)
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L06
|
||||
.align 4
|
||||
|
||||
$L10:
|
||||
mov Y, Y1
|
||||
fclr t0
|
||||
unop
|
||||
fclr t1
|
||||
|
||||
sra N, 1, J
|
||||
fclr t2
|
||||
fclr t3
|
||||
ble J, $L20
|
||||
.align 4
|
||||
|
||||
$L11:
|
||||
mov A, A1
|
||||
fclr s0
|
||||
addq A, LDA, A2
|
||||
fclr s1
|
||||
|
||||
addq A2, LDA, A
|
||||
unop
|
||||
mov X, X1
|
||||
lds $f31, 3 * SIZE(Y)
|
||||
|
||||
sra M, 2, I
|
||||
fclr s2
|
||||
fclr s3
|
||||
ble I, $L15
|
||||
|
||||
LD a0, 0 * SIZE(A1)
|
||||
LD a1, 1 * SIZE(A1)
|
||||
LD a2, 0 * SIZE(A2)
|
||||
LD a3, 1 * SIZE(A2)
|
||||
LD a4, 2 * SIZE(A1)
|
||||
LD a5, 3 * SIZE(A1)
|
||||
LD a6, 2 * SIZE(A2)
|
||||
LD a7, 3 * SIZE(A2)
|
||||
|
||||
LD a8, 4 * SIZE(A1)
|
||||
LD a9, 5 * SIZE(A1)
|
||||
LD a10, 4 * SIZE(A2)
|
||||
LD a11, 5 * SIZE(A2)
|
||||
LD a12, 6 * SIZE(A1)
|
||||
LD a13, 7 * SIZE(A1)
|
||||
LD a14, 6 * SIZE(A2)
|
||||
LD a15, 7 * SIZE(A2)
|
||||
|
||||
LD x0, 0 * SIZE(X1)
|
||||
LD x1, 1 * SIZE(X1)
|
||||
LD x2, 2 * SIZE(X1)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
$L12:
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a0, t0
|
||||
LD x3, 3 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
||||
MUL x0, a1, t1
|
||||
unop
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
unop
|
||||
MUL x0, a2, t2
|
||||
unop
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x0, a3, t3
|
||||
LD x0, 4 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
unop
|
||||
MUL x1, a1, t0
|
||||
LD a1, 9 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
unop
|
||||
MUL x1, a0, t1
|
||||
LD a0, 8 * SIZE(A1)
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
unop
|
||||
MUL x1, a3, t2
|
||||
LD a3, 9 * SIZE(A2)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
unop
|
||||
MUL x1, a2, t3
|
||||
LD a2, 8 * SIZE(A2)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x2, a4, t0
|
||||
LD x1, 5 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
MUL x2, a5, t1
|
||||
ADD3 s2, t2, s2
|
||||
MUL x2, a6, t2
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x2, a7, t3
|
||||
LD x2, 6 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
unop
|
||||
MUL x3, a5, t0
|
||||
LD a5, 11 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
unop
|
||||
MUL x3, a4, t1
|
||||
LD a4, 10 * SIZE(A1)
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
unop
|
||||
MUL x3, a7, t2
|
||||
LD a7, 11 * SIZE(A2)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
unop
|
||||
MUL x3, a6, t3
|
||||
LD a6, 10 * SIZE(A2)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a8, t0
|
||||
LD x3, 7 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
ldl $31, (PREFETCHSIZE + 0) * SIZE(A2)
|
||||
MUL x0, a9, t1
|
||||
unop
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
lda I, -1(I)
|
||||
MUL x0, a10, t2
|
||||
unop
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x0, a11, t3
|
||||
LD x0, 8 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
unop
|
||||
MUL x1, a9, t0
|
||||
LD a9, 13 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
unop
|
||||
MUL x1, a8, t1
|
||||
LD a8, 12 * SIZE(A1)
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
lda A1, 8 * SIZE(A1)
|
||||
MUL x1, a11, t2
|
||||
LD a11, 13 * SIZE(A2)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
unop
|
||||
MUL x1, a10, t3
|
||||
LD a10, 12 * SIZE(A2)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x2, a12, t0
|
||||
LD x1, 9 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
|
||||
MUL x2, a13, t1
|
||||
lda A2, 8 * SIZE(A2)
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
unop
|
||||
MUL x2, a14, t2
|
||||
unop
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x2, a15, t3
|
||||
LD x2, 10 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
unop
|
||||
MUL x3, a13, t0
|
||||
LD a13, 7 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
lda X1, 8 * SIZE(X1)
|
||||
MUL x3, a12, t1
|
||||
LD a12, 6 * SIZE(A1)
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
unop
|
||||
MUL x3, a15, t2
|
||||
LD a15, 7 * SIZE(A2)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
MUL x3, a14, t3
|
||||
LD a14, 6 * SIZE(A2)
|
||||
bgt I, $L12
|
||||
.align 4
|
||||
|
||||
$L13:
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a0, t0
|
||||
LD x3, 3 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
MUL x0, a1, t1
|
||||
ADD3 s2, t2, s2
|
||||
MUL x0, a2, t2
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x0, a3, t3
|
||||
LD x0, 4 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x1, a1, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x1, a0, t1
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
unop
|
||||
MUL x1, a3, t2
|
||||
unop
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
lda A1, 8 * SIZE(A1)
|
||||
MUL x1, a2, t3
|
||||
LD x1, 5 * SIZE(X1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
MUL x2, a4, t0
|
||||
ADD4 s1, t1, s1
|
||||
MUL x2, a5, t1
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
unop
|
||||
MUL x2, a6, t2
|
||||
unop
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
lda A2, 8 * SIZE(A2)
|
||||
MUL x2, a7, t3
|
||||
LD x2, 6 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x3, a5, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x3, a4, t1
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
unop
|
||||
MUL x3, a7, t2
|
||||
lda X1, 8 * SIZE(X1)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
unop
|
||||
MUL x3, a6, t3
|
||||
LD x3, -1 * SIZE(X1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
MUL x0, a8, t0
|
||||
ADD4 s1, t1, s1
|
||||
MUL x0, a9, t1
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
MUL x0, a10, t2
|
||||
ADD4 s3, t3, s3
|
||||
MUL x0, a11, t3
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x1, a9, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x1, a8, t1
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
MUL x1, a11, t2
|
||||
ADD2 s3, t3, s3
|
||||
MUL x1, a10, t3
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
MUL x2, a12, t0
|
||||
ADD4 s1, t1, s1
|
||||
MUL x2, a13, t1
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
MUL x2, a14, t2
|
||||
ADD4 s3, t3, s3
|
||||
MUL x2, a15, t3
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x3, a13, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x3, a12, t1
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
MUL x3, a15, t2
|
||||
ADD2 s3, t3, s3
|
||||
MUL x3, a14, t3
|
||||
.align 4
|
||||
|
||||
$L15:
|
||||
and M, 3, I
|
||||
ble I, $L18
|
||||
|
||||
LD a0, 0 * SIZE(A1)
|
||||
LD a1, 1 * SIZE(A1)
|
||||
LD a2, 0 * SIZE(A2)
|
||||
LD a3, 1 * SIZE(A2)
|
||||
|
||||
LD x0, 0 * SIZE(X1)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L17
|
||||
.align 4
|
||||
|
||||
$L16:
|
||||
ADD3 s0, t0, s0
|
||||
lda I, -1(I)
|
||||
MUL x0, a0, t0
|
||||
LD x1, 1 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
MUL x0, a1, t1
|
||||
ADD3 s2, t2, s2
|
||||
MUL x0, a2, t2
|
||||
|
||||
ADD4 s3, t3, s3
|
||||
unop
|
||||
MUL x0, a3, t3
|
||||
LD x0, 2 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
lda A2, 2 * SIZE(A2)
|
||||
MUL x1, a1, t0
|
||||
LD a1, 3 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
lda X1, 2 * SIZE(X1)
|
||||
MUL x1, a0, t1
|
||||
LD a0, 2 * SIZE(A1)
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
lda A1, 2 * SIZE(A1)
|
||||
MUL x1, a3, t2
|
||||
LD a3, 1 * SIZE(A2)
|
||||
|
||||
ADD2 s3, t3, s3
|
||||
MUL x1, a2, t3
|
||||
LD a2, 0 * SIZE(A2)
|
||||
bgt I, $L16
|
||||
.align 4
|
||||
|
||||
$L17:
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a0, t0
|
||||
LD x1, 1 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x0, a1, t1
|
||||
unop
|
||||
|
||||
ADD3 s2, t2, s2
|
||||
MUL x0, a2, t2
|
||||
ADD4 s3, t3, s3
|
||||
MUL x0, a3, t3
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x1, a1, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x1, a0, t1
|
||||
|
||||
ADD1 s2, t2, s2
|
||||
MUL x1, a3, t2
|
||||
ADD2 s3, t3, s3
|
||||
MUL x1, a2, t3
|
||||
.align 4
|
||||
|
||||
$L18:
|
||||
LD a0, 0 * SIZE(Y)
|
||||
unop
|
||||
LD a1, 1 * SIZE(Y)
|
||||
addq Y, INCY, Y
|
||||
|
||||
LD a2, 0 * SIZE(Y)
|
||||
unop
|
||||
LD a3, 1 * SIZE(Y)
|
||||
addq Y, INCY, Y
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
ADD4 s1, t1, s1
|
||||
ADD3 s2, t2, s2
|
||||
ADD4 s3, t3, s3
|
||||
|
||||
MUL alpha_r, s0, t0
|
||||
MUL alpha_r, s1, t1
|
||||
MUL alpha_r, s2, t2
|
||||
MUL alpha_r, s3, t3
|
||||
|
||||
ADD a0, t0, a0
|
||||
MUL alpha_i, s1, t0
|
||||
ADD a1, t1, a1
|
||||
MUL alpha_i, s0, t1
|
||||
ADD a2, t2, a2
|
||||
MUL alpha_i, s3, t2
|
||||
ADD a3, t3, a3
|
||||
MUL alpha_i, s2, t3
|
||||
|
||||
SUB a0, t0, a0
|
||||
ADD a1, t1, a1
|
||||
SUB a2, t2, a2
|
||||
ADD a3, t3, a3
|
||||
|
||||
ST a0, 0 * SIZE(Y1)
|
||||
fclr t0
|
||||
ST a1, 1 * SIZE(Y1)
|
||||
addq Y1, INCY, Y1
|
||||
|
||||
ST a2, 0 * SIZE(Y1)
|
||||
fclr t1
|
||||
ST a3, 1 * SIZE(Y1)
|
||||
addq Y1, INCY, Y1
|
||||
|
||||
fclr t2
|
||||
lda J, -1(J)
|
||||
fclr t3
|
||||
bgt J, $L11
|
||||
.align 4
|
||||
|
||||
$L20:
|
||||
blbc N, $L999
|
||||
|
||||
mov A, A1
|
||||
fclr s0
|
||||
fclr s1
|
||||
mov X, X1
|
||||
|
||||
sra M, 2, I
|
||||
fclr s2
|
||||
fclr s3
|
||||
ble I, $L25
|
||||
|
||||
LD a0, 0 * SIZE(A1)
|
||||
LD a1, 1 * SIZE(A1)
|
||||
LD a4, 2 * SIZE(A1)
|
||||
LD a5, 3 * SIZE(A1)
|
||||
LD a8, 4 * SIZE(A1)
|
||||
LD a9, 5 * SIZE(A1)
|
||||
LD a12, 6 * SIZE(A1)
|
||||
LD a13, 7 * SIZE(A1)
|
||||
|
||||
LD x0, 0 * SIZE(X1)
|
||||
LD x1, 1 * SIZE(X1)
|
||||
LD x2, 2 * SIZE(X1)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L23
|
||||
.align 4
|
||||
|
||||
$L22:
|
||||
ADD3 s0, t0, s0
|
||||
ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
|
||||
MUL x0, a0, t0
|
||||
LD x3, 3 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x0, a1, t1
|
||||
LD x0, 4 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
lda I, -1(I)
|
||||
MUL x1, a1, t0
|
||||
LD a1, 9 * SIZE(A1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
unop
|
||||
MUL x1, a0, t1
|
||||
LD a0, 8 * SIZE(A1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x2, a4, t0
|
||||
LD x1, 5 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x2, a5, t1
|
||||
LD x2, 6 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
unop
|
||||
MUL x3, a5, t0
|
||||
LD a5, 11 * SIZE(A1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
unop
|
||||
MUL x3, a4, t1
|
||||
LD a4, 10 * SIZE(A1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a8, t0
|
||||
LD x3, 7 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x0, a9, t1
|
||||
LD x0, 8 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
unop
|
||||
MUL x1, a9, t0
|
||||
LD a9, 13 * SIZE(A1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
unop
|
||||
MUL x1, a8, t1
|
||||
LD a8, 12 * SIZE(A1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x2, a12, t0
|
||||
LD x1, 9 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
lda A1, 8 * SIZE(A1)
|
||||
MUL x2, a13, t1
|
||||
LD x2, 10 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
lda X1, 8 * SIZE(X1)
|
||||
MUL x3, a13, t0
|
||||
LD a13, 7 * SIZE(A1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
MUL x3, a12, t1
|
||||
LD a12, 6 * SIZE(A1)
|
||||
bgt I, $L22
|
||||
.align 4
|
||||
|
||||
$L23:
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a0, t0
|
||||
LD x3, 3 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x0, a1, t1
|
||||
LD x0, 4 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
unop
|
||||
MUL x1, a1, t0
|
||||
lda A1, 8 * SIZE(A1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
unop
|
||||
MUL x1, a0, t1
|
||||
LD x1, 5 * SIZE(X1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x2, a4, t0
|
||||
unop
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x2, a5, t1
|
||||
LD x2, 6 * SIZE(X1)
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
unop
|
||||
MUL x3, a5, t0
|
||||
lda X1, 8 * SIZE(X1)
|
||||
|
||||
ADD2 s3, t1, s3
|
||||
unop
|
||||
MUL x3, a4, t1
|
||||
LD x3, -1 * SIZE(X1)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
MUL x0, a8, t0
|
||||
ADD4 s1, t1, s1
|
||||
MUL x0, a9, t1
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
MUL x1, a9, t0
|
||||
ADD2 s3, t1, s3
|
||||
MUL x1, a8, t1
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
MUL x2, a12, t0
|
||||
ADD4 s1, t1, s1
|
||||
MUL x2, a13, t1
|
||||
|
||||
ADD1 s2, t0, s2
|
||||
MUL x3, a13, t0
|
||||
ADD2 s3, t1, s3
|
||||
MUL x3, a12, t1
|
||||
.align 4
|
||||
|
||||
$L25:
|
||||
and M, 3, I
|
||||
ble I, $L28
|
||||
|
||||
LD a0, 0 * SIZE(A1)
|
||||
LD a1, 1 * SIZE(A1)
|
||||
|
||||
LD x0, 0 * SIZE(X1)
|
||||
|
||||
lda I, -1(I)
|
||||
ble I, $L27
|
||||
.align 4
|
||||
|
||||
$L26:
|
||||
ADD3 s0, t0, s0
|
||||
lda A1, 2 * SIZE(A1)
|
||||
MUL x0, a0, t0
|
||||
LD x1, 1 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
lda I, -1(I)
|
||||
MUL x0, a1, t1
|
||||
LD x0, 2 * SIZE(X1)
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
lda X1, 2 * SIZE(X1)
|
||||
MUL x1, a1, t0
|
||||
LD a1, 1 * SIZE(A1)
|
||||
|
||||
ADD2 s1, t1, s1
|
||||
MUL x1, a0, t1
|
||||
LD a0, 0 * SIZE(A1)
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
$L27:
|
||||
ADD3 s0, t0, s0
|
||||
unop
|
||||
MUL x0, a0, t0
|
||||
LD x1, 1 * SIZE(X1)
|
||||
|
||||
ADD4 s1, t1, s1
|
||||
unop
|
||||
MUL x0, a1, t1
|
||||
unop
|
||||
|
||||
ADD1 s0, t0, s0
|
||||
MUL x1, a1, t0
|
||||
ADD2 s1, t1, s1
|
||||
MUL x1, a0, t1
|
||||
.align 4
|
||||
|
||||
$L28:
|
||||
LD a0, 0 * SIZE(Y)
|
||||
LD a1, 1 * SIZE(Y)
|
||||
|
||||
ADD3 s0, t0, s0
|
||||
ADD4 s1, t1, s1
|
||||
ADD3 s2, t2, s2
|
||||
ADD4 s3, t3, s3
|
||||
|
||||
ADD s0, s2, s0
|
||||
ADD s1, s3, s1
|
||||
|
||||
MUL alpha_r, s0, t0
|
||||
MUL alpha_r, s1, t1
|
||||
|
||||
ADD a0, t0, a0
|
||||
MUL alpha_i, s1, t0
|
||||
ADD a1, t1, a1
|
||||
MUL alpha_i, s0, t1
|
||||
|
||||
SUB a0, t0, a0
|
||||
ADD a1, t1, a1
|
||||
|
||||
ST a0, 0 * SIZE(Y1)
|
||||
ST a1, 1 * SIZE(Y1)
|
||||
.align 4
|
||||
|
||||
$L999:
|
||||
ldt $f2, 0($sp)
|
||||
ldt $f3, 8($sp)
|
||||
ldt $f4, 16($sp)
|
||||
ldt $f5, 24($sp)
|
||||
ldt $f6, 32($sp)
|
||||
ldt $f7, 40($sp)
|
||||
ldt $f8, 48($sp)
|
||||
ldt $f9, 56($sp)
|
||||
|
||||
lda $sp, STACKSIZE($sp)
|
||||
ret
|
||||
EPILOGUE
|
||||
426
kernel/alpha/znrm2.S
Normal file
426
kernel/alpha/znrm2.S
Normal file
@@ -0,0 +1,426 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Register and constant map for this kernel (Alpha assembly, run through cpp). */
/* Distance, in scalar elements, of the `ldl $31, (PREFETCH_SIZE) * SIZE(X)` issued in the unit-stride loop. */
#define PREFETCH_SIZE 80
|
||||
|
||||
/* Integer registers. N, X and INCX are read on entry as the element count, vector pointer and stride. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
/* XX receives a copy of X (`mov X, XX`) so loads can keep using the old base after X is advanced. */
#define XX $19
|
||||
|
||||
/* I is the loop counter; $0 is also used once at entry as the result of the stride comparison. */
#define I $0
|
||||
|
||||
/* a0..a3: four independent running sums of squares. a0 ($f0) also holds the final square root on the non-EV4/EV5 path. */
#define a0 $f0
|
||||
#define a1 $f1
|
||||
#define a2 $f10
|
||||
#define a3 $f11
|
||||
/* t0..t3: the most recent square for each accumulator, added into a0..a3 one step later. */
#define t0 $f12
|
||||
#define t1 $f13
|
||||
#define t2 $f14
|
||||
#define t3 $f15
|
||||
|
||||
/* x0..x7: elements loaded from X and waiting to be squared. */
#define x0 $f16
|
||||
#define x1 $f17
|
||||
#define x2 $f18
|
||||
#define x3 $f19
|
||||
#define x4 $f20
|
||||
#define x5 $f21
|
||||
#define x6 $f22
|
||||
#define x7 $f23
|
||||
|
||||
/* Entry: set up the frame (EV4/EV5 only), clear the accumulators, scale the stride, */
/* pick the unit-stride or strided path, and preload the first eight scalars. */
PROLOGUE
|
||||
|
||||
/* EV4/EV5 builds finish with a call to sqrt (see the jsr near $L998) instead of the sqrtt */
/* instruction, so they need a 16-byte frame to keep the return address in $26 across that call. */
#if defined(EV4) || defined(EV5)
|
||||
.frame $30,16,$26,0
|
||||
.mask 0x4000000,-16
|
||||
ldah $29, 0($27) !gpdisp!1
|
||||
lda $29, 0($29) !gpdisp!1
|
||||
|
||||
lda $sp, -16($sp)
|
||||
/* Fetch the address of sqrt into $27 now; it is consumed by the jsr at the end. */
/* NOTE(review): assumes PROFCODE does not clobber $27 - confirm against common.h. */
ldq $27, sqrt($29) !literal!2
|
||||
stq $26, 0($sp)
|
||||
|
||||
PROFCODE
|
||||
.prologue 1
|
||||
#else
|
||||
PROFCODE
|
||||
#endif
|
||||
|
||||
fclr a0
|
||||
/* Scale INCX by ZBASE_SHIFT (from common.h; presumably turns an element stride into a byte stride - confirm). */
sll INCX, ZBASE_SHIFT, INCX
|
||||
fclr a1
|
||||
/* N <= 0: leave through $L999 with a0 still zero. */
ble N, $L999
|
||||
|
||||
fclr a2
|
||||
/* $0 = 1 when the scaled stride equals two scalars, i.e. the elements are contiguous. */
cmpeq INCX, 2 * SIZE, $0
|
||||
fclr a3
|
||||
/* Non-contiguous input takes the strided path at $L20. */
beq $0, $L20
|
||||
|
||||
fclr t0
|
||||
/* I = N / 8: each main-loop pass consumes 16 scalars (8 complex elements). */
sra N, 3, I
|
||||
fclr t1
|
||||
/* Fewer than 8 elements: go straight to the remainder loop, which only uses t0/t1 and a0/a1. */
ble I, $L15
|
||||
|
||||
fclr t2
|
||||
LD x0, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
LD x2, 2 * SIZE(X)
|
||||
LD x3, 3 * SIZE(X)
|
||||
LD x4, 4 * SIZE(X)
|
||||
LD x5, 5 * SIZE(X)
|
||||
LD x6, 6 * SIZE(X)
|
||||
LD x7, 7 * SIZE(X)
|
||||
|
||||
/* One group is always handled by the drain code at $L12; loop at $L11 only for the others. */
lda I, -1(I)
|
||||
ble I, $L12
|
||||
.align 4
|
||||
|
||||
/* Unit-stride main loop: one pass squares 16 scalars (offsets 0..15 from X at loop entry) */
/* and reloads x0..x7 with offsets 16..23 for the next pass. */
/* Each `addt aK, tK, aK` adds the square produced by the previous `mult ..., tK`, */
/* so t0..t3 must be zero (or hold a pending square) on entry. */
$L11:
|
||||
addt a0, t0, a0
|
||||
/* Load into $31 discards the value; used here as a prefetch PREFETCH_SIZE elements ahead. */
ldl $31, (PREFETCH_SIZE) * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
/* Keep the pre-increment base in XX; loads after `lda X, 16 * SIZE(X)` below index off XX. */
mov X, XX
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(X)
|
||||
|
||||
/* Second half of the pass: square offsets 8..15 while loading 16..23. */
addt a0, t0, a0
|
||||
unop
|
||||
mult x0, x0, t0
|
||||
LD x0, 16 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
/* Advance X by one full pass; the remaining loads of this pass use XX (old X). */
lda X, 16 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
LD x1, 17 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 18 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 19 * SIZE(XX)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 20 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
lda I, -1(I)
|
||||
mult x5, x5, t1
|
||||
LD x5, 21 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 22 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
LD x7, 23 * SIZE(XX)
|
||||
bgt I, $L11
|
||||
.align 4
|
||||
|
||||
/* Drain for the last unit-stride group: squares x0..x7 already in registers (offsets 0..7), */
/* loads and squares offsets 8..15, and issues no loads beyond offset 15. */
$L12:
|
||||
addt a0, t0, a0
|
||||
/* XX = current X so that offsets 12..15 can still be loaded after X is advanced below. */
mov X, XX
|
||||
mult x0, x0, t0
|
||||
LD x0, 8 * SIZE(X)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x1, x1, t1
|
||||
LD x1, 9 * SIZE(X)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x2, x2, t2
|
||||
LD x2, 10 * SIZE(X)
|
||||
|
||||
addt a3, t3, a3
|
||||
unop
|
||||
mult x3, x3, t3
|
||||
LD x3, 11 * SIZE(X)
|
||||
|
||||
addt a0, t0, a0
|
||||
unop
|
||||
mult x4, x4, t0
|
||||
LD x4, 12 * SIZE(XX)
|
||||
|
||||
addt a1, t1, a1
|
||||
unop
|
||||
mult x5, x5, t1
|
||||
LD x5, 13 * SIZE(XX)
|
||||
|
||||
addt a2, t2, a2
|
||||
unop
|
||||
mult x6, x6, t2
|
||||
LD x6, 14 * SIZE(XX)
|
||||
|
||||
addt a3, t3, a3
|
||||
/* Leave X pointing just past this group for the remainder loop at $L16. */
lda X, 16 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
LD x7, 15 * SIZE(XX)
|
||||
|
||||
/* No more loads: square the second eight scalars. */
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
|
||||
/* Flush the pending t2/t3 here; t0/t1 stay pending and are added by $L16 or $L998. */
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* Unit-stride remainder: handle the N mod 8 complex elements left over, one per pass. */
$L15:
|
||||
and N, 7, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L16:
|
||||
/* x0/x1 = the two scalars of one complex element. */
LD x0, 0 * SIZE(X)
|
||||
LD x1, 1 * SIZE(X)
|
||||
|
||||
lda X, 2 * SIZE(X)
|
||||
|
||||
/* Add the previously pending squares, then leave the new squares pending in t0/t1. */
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L16
|
||||
/* bsr with $31 as the link register discards the return address: a plain jump over the strided path. */
bsr $31, $L998
|
||||
.align 4
|
||||
|
||||
/* Strided path setup: X advances by INCX (already scaled at entry) after each complex element. */
/* Preloads x0..x6; x7 is loaded at the top of $L21 / $L22. */
$L20:
|
||||
fclr t0
|
||||
/* I = N / 4: each pass of the strided loop handles 4 complex elements (8 scalars). */
sra N, 2, I
|
||||
fclr t1
|
||||
/* Fewer than 4 elements: the remainder loop at $L26 only needs t0/t1. */
ble I, $L25
|
||||
|
||||
LD x0, 0 * SIZE(X)
|
||||
fclr t2
|
||||
LD x1, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
LD x2, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD x3, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x4, 0 * SIZE(X)
|
||||
lda I, -1(I)
|
||||
LD x5, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
LD x6, 0 * SIZE(X)
|
||||
/* Only one group: skip the loop and drain at $L22. */
ble I, $L22
|
||||
.align 4
|
||||
|
||||
/* Strided main loop: squares the four complex elements held in x0..x7 and reloads x0..x6 */
/* from the next four element positions. On entry x7 is not yet loaded and X points at the */
/* element whose first scalar is already in x6. */
$L21:
|
||||
addt a0, t0, a0
|
||||
/* Second scalar of the fourth element of this group. */
LD x7, 1 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x0, 0 * SIZE(X)
|
||||
mult x1, x1, t1
|
||||
unop
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x1, 1 * SIZE(X)
|
||||
mult x2, x2, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
LD x2, 0 * SIZE(X)
|
||||
mult x3, x3, t3
|
||||
unop
|
||||
|
||||
addt a0, t0, a0
|
||||
LD x3, 1 * SIZE(X)
|
||||
mult x4, x4, t0
|
||||
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
LD x4, 0 * SIZE(X)
|
||||
mult x5, x5, t1
|
||||
lda I, -1(I)
|
||||
|
||||
addt a2, t2, a2
|
||||
LD x5, 1 * SIZE(X)
|
||||
mult x6, x6, t2
|
||||
addq X, INCX, X
|
||||
|
||||
addt a3, t3, a3
|
||||
/* First scalar of the next group's fourth element; its partner is loaded at the next loop top or in $L22. */
LD x6, 0 * SIZE(X)
|
||||
mult x7, x7, t3
|
||||
bgt I, $L21
|
||||
.align 4
|
||||
|
||||
/* Drain for the last strided group: load the one missing scalar (x7), then square x0..x7 */
/* with no further loads. */
$L22:
|
||||
addt a0, t0, a0
|
||||
LD x7, 1 * SIZE(X)
|
||||
mult x0, x0, t0
|
||||
/* Step past the last element of the group so $L26 starts at the first leftover element. */
addq X, INCX, X
|
||||
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
addt a2, t2, a2
|
||||
mult x2, x2, t2
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x3, x3, t3
|
||||
addt a0, t0, a0
|
||||
mult x4, x4, t0
|
||||
|
||||
addt a1, t1, a1
|
||||
mult x5, x5, t1
|
||||
addt a2, t2, a2
|
||||
mult x6, x6, t2
|
||||
|
||||
addt a3, t3, a3
|
||||
mult x7, x7, t3
|
||||
/* Flush t2/t3 now; t0/t1 remain pending for $L26 or $L998. */
addt a2, t2, a2
|
||||
addt a3, t3, a3
|
||||
.align 4
|
||||
|
||||
/* Strided remainder: handle the N mod 4 leftover complex elements, one per pass, */
/* then fall through to the final reduction at $L998. */
$L25:
|
||||
and N, 3, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L26:
|
||||
LD x0, 0 * SIZE(X)
|
||||
lda I, -1(I)
|
||||
LD x1, 1 * SIZE(X)
|
||||
addq X, INCX, X
|
||||
|
||||
/* Add the pending squares, then leave this element's squares pending in t0/t1. */
addt a0, t0, a0
|
||||
mult x0, x0, t0
|
||||
addt a1, t1, a1
|
||||
mult x1, x1, t1
|
||||
|
||||
bgt I, $L26
|
||||
.align 4
|
||||
|
||||
|
||||
/* Final reduction: flush the pending squares in t0/t1, fold the four partial sums together, */
/* take the square root, and return. */
/* NOTE(review): the squares are accumulated directly with no scaling, so extreme magnitudes */
/* could overflow or underflow before the sqrt - confirm whether callers rely on a scaled norm. */
$L998:
|
||||
addt a0, t0, a0
|
||||
addt a1, t1, a1
|
||||
|
||||
addt a0, a1, a0
|
||||
addt a2, a3, a2
|
||||
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Total goes into $f16 and sqrt is called through $27 (its address was loaded in the prologue). */
/* NOTE(review): presumably $f16 is the first FP argument and the result comes back in $f0 - confirm against the ABI. */
addt a0, a2, $f16
|
||||
jsr $26, ($27), sqrt !lituse_jsr!2
|
||||
|
||||
/* Recompute $29 (gp) from the return address left in $26 by the call. */
ldah $29, 0($26) !gpdisp!3
|
||||
lda $29, 0($29) !gpdisp!3
|
||||
#else
|
||||
/* Other targets use the sqrtt instruction directly; the result is left in a0 ($f0). */
addt a0, a2, a0
|
||||
sqrtt a0, a0
|
||||
#endif
|
||||
.align 4
|
||||
|
||||
/* Common exit. Also reached directly from entry when N <= 0, with a0 still zero. */
$L999:
|
||||
#if defined(EV4) || defined(EV5)
|
||||
/* Restore the return address saved in the prologue and release the 16-byte frame. */
ldq $26, 0($sp)
|
||||
lda $sp, 16($sp)
|
||||
#endif
|
||||
ret
|
||||
EPILOGUE
|
||||
631
kernel/alpha/zrot.S
Normal file
631
kernel/alpha/zrot.S
Normal file
@@ -0,0 +1,631 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Register map for the rotation kernel. */
/* N, X, INCX, Y, INCY are read on entry as the element count, the two vector pointers and their strides. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define Y $19
|
||||
#define INCY $20
|
||||
/* I: loop counter. */
#define I $21
|
||||
/* XX/YY: store pointers that trail X/Y in the strided loop; $23/$24 are also used as */
/* scratch for the stride comparisons at entry. */
#define XX $23
|
||||
#define YY $24
|
||||
|
||||
/* C and S: the two rotation coefficients. C is copied out of $f21 and S is loaded from 0($sp) at entry. */
#define C $f10
|
||||
#define S $f11
|
||||
|
||||
/* Distance, in scalar elements, of the `lds $f31, (PREFETCH_SIZE) * SIZE(...)` loads in the unit-stride loop. */
#define PREFETCH_SIZE 80
|
||||
|
||||
/* Entry: fetch the coefficients, normalise the strides, choose the unit-stride or strided */
/* path, and prime the software pipeline for the unit-stride loop. */
/* Every scalar pair (x, y) is replaced by x' = C*x + S*y and y' = C*y - S*x. */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
/* Move the first coefficient out of $f21, which is reused below as a product temporary. */
fmov $f21, C
|
||||
/* The second coefficient is read from the first stack slot. */
/* NOTE(review): presumably it is the seventh argument and so passed in memory - confirm against the C prototype. */
LD S, 0($sp)
|
||||
|
||||
/* Double both strides: each complex element is two scalars. */
addq INCX, INCX, INCX
|
||||
addq INCY, INCY, INCY
|
||||
|
||||
/* $23/$24 = 1 when the corresponding vector is contiguous (doubled stride == 2). */
cmpeq INCX, 2, $23
|
||||
cmpeq INCY, 2, $24
|
||||
ble N, $L998
|
||||
|
||||
/* Take the strided path at $L50 unless both vectors are contiguous. */
and $23, $24, $23
|
||||
beq $23, $L50
|
||||
|
||||
/* I = N / 4: each unit-stride pass rotates 4 complex elements (8 scalars per vector). */
sra N, 2, I
|
||||
ble I, $L15
|
||||
|
||||
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
LD $f15, 1*SIZE(Y)
|
||||
|
||||
LD $f16, 2*SIZE(X)
|
||||
LD $f17, 2*SIZE(Y)
|
||||
LD $f18, 3*SIZE(X)
|
||||
LD $f19, 3*SIZE(Y)
|
||||
|
||||
/* Prime the pipeline: start the products for scalars 0 and 1 and reload their source */
/* registers with scalars 4 and 5, so $L12/$L13 begin with a store of finished results. */
MUL C, $f12, $f21
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
|
||||
LD $f13, 4*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
LD $f12, 4*SIZE(X)
|
||||
MUL C, $f14, $f25
|
||||
|
||||
lda I, -1(I)
|
||||
MUL S, $f15, $f26
|
||||
/* $f22 = C*x0 + S*y0 (new x0). */
ADD $f21, $f22, $f22
|
||||
MUL C, $f15, $f27
|
||||
|
||||
LD $f15, 5*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
/* $f24 = C*y0 - S*x0 (new y0). */
SUB $f23, $f24, $f24
|
||||
/* Only one group: skip the loop and drain at $L13. */
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/* Unit-stride main loop, software pipelined. One pass stores rotated scalars 0..7 of X and Y */
/* and begins the products for scalars 8 and 9 of the following pass. */
/* Register roles at every step: */
/*   $f21 = C*x, $f22 = S*y, then ADD -> $f22 = new x (even scalar) */
/*   $f23 = C*y, $f24 = S*x, then SUB -> $f24 = new y (even scalar) */
/*   $f25..$f28 play the same roles for the odd scalar. */
/* Source registers $f12..$f19 are reloaded as soon as their last product has been issued. */
$L12:
|
||||
MUL C, $f16, $f21
|
||||
/* Load into $f31 discards the value; used here as a prefetch on X. */
lds $f31, (PREFETCH_SIZE) * SIZE(X)
|
||||
unop
|
||||
LD $f14, 5*SIZE(X)
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
/* Same prefetch for Y. */
lds $f31, (PREFETCH_SIZE) * SIZE(Y)
|
||||
unop
|
||||
LD $f17, 6*SIZE(Y)
|
||||
|
||||
ST $f24, 0*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
LD $f16, 6*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
LD $f19, 7*SIZE(Y)
|
||||
|
||||
ST $f28, 1*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
LD $f18, 7*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 2*SIZE(X)
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
LD $f13, 8*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 2*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
LD $f12, 8*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 3*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
LD $f15, 9*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 3*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f16, $f21
|
||||
LD $f14, 9*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 4*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
LD $f17, 10*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 4*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
LD $f16, 10*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 5*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
LD $f19, 11*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 5*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
lda I, -1(I)
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
LD $f18, 11*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 6*SIZE(X)
|
||||
MUL S, $f13, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
LD $f13, 12*SIZE(Y)
|
||||
/* X moves on by one pass here; X-relative offsets below are relative to the new X. */
lda X, 8*SIZE(X)
|
||||
unop
|
||||
|
||||
ST $f24, 6*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
LD $f12, 4*SIZE(X)
|
||||
/* Y moves on by one pass here; Y-relative offsets below are relative to the new Y. */
lda Y, 8*SIZE(Y)
|
||||
unop
|
||||
|
||||
/* Offset -1 from the advanced pointer is scalar 7 of the pass just finished. */
ST $f26, -1*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
LD $f15, 5*SIZE(Y)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, -1*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
SUB $f23, $f24, $f24
|
||||
bgt I, $L12
|
||||
.align 4
|
||||
|
||||
/* Drain for the last unit-stride group: same store/multiply/add schedule as $L12, but it */
/* loads only scalars 5..7 (the last ones of this group), issues no prefetches, and starts */
/* no products for a following group. Stores rotated scalars 0..7 of X and Y, then advances both pointers. */
$L13:
|
||||
MUL C, $f16, $f21
|
||||
LD $f14, 5*SIZE(X)
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
unop
|
||||
unop
|
||||
LD $f17, 6*SIZE(Y)
|
||||
|
||||
ST $f24, 0*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
LD $f16, 6*SIZE(X)
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
LD $f19, 7*SIZE(Y)
|
||||
|
||||
ST $f28, 1*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
LD $f18, 7*SIZE(X)
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f12, $f21
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 2*SIZE(X)
|
||||
unop
|
||||
MUL S, $f13, $f22
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f13, $f23
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 2*SIZE(Y)
|
||||
MUL S, $f12, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f14, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 3*SIZE(X)
|
||||
MUL S, $f15, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f15, $f27
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 3*SIZE(Y)
|
||||
MUL S, $f14, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f16, $f21
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f22, 4*SIZE(X)
|
||||
MUL S, $f17, $f22
|
||||
unop
|
||||
ADD $f25, $f26, $f26
|
||||
|
||||
MUL C, $f17, $f23
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f24, 4*SIZE(Y)
|
||||
MUL S, $f16, $f24
|
||||
unop
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
MUL C, $f18, $f25
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f26, 5*SIZE(X)
|
||||
MUL S, $f19, $f26
|
||||
unop
|
||||
ADD $f21, $f22, $f22
|
||||
|
||||
MUL C, $f19, $f27
|
||||
unop
|
||||
unop
|
||||
unop
|
||||
|
||||
ST $f28, 5*SIZE(Y)
|
||||
MUL S, $f18, $f28
|
||||
unop
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
/* Pipeline empty: only the last sums and stores remain. */
ST $f22, 6*SIZE(X)
|
||||
ADD $f25, $f26, $f26
|
||||
ST $f24, 6*SIZE(Y)
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f26, 7*SIZE(X)
|
||||
/* Leave X and Y just past this group for the remainder loop at $L16. */
lda X, 8*SIZE(X)
|
||||
ST $f28, 7*SIZE(Y)
|
||||
lda Y, 8*SIZE(Y)
|
||||
.align 4
|
||||
|
||||
|
||||
/* Unit-stride remainder: rotate the N mod 4 leftover complex elements one at a time, */
/* with no software pipelining. */
$L15:
|
||||
and N, 3, I
|
||||
ble I, $L998
|
||||
.align 4
|
||||
|
||||
$L16:
|
||||
/* $f12/$f14 = the two scalars of the X element, $f13/$f15 = the two scalars of the Y element. */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
LD $f15, 1*SIZE(Y)
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
/* First scalar: $f22 = C*x + S*y, $f24 = C*y - S*x. */
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
/* Second scalar: $f26 = C*x + S*y, $f28 = C*y - S*x. */
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
ST $f24, 0*SIZE(Y)
|
||||
lda I, -1(I)
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
lda X, 2 * SIZE(X)
|
||||
/* Y has not been advanced yet, so offset 1 still addresses the current element. */
ST $f28, 1*SIZE(Y)
|
||||
lda Y, 2 * SIZE(Y)
|
||||
|
||||
bgt I, $L16
|
||||
.align 4
|
||||
|
||||
/* Exit for the unit-stride path (and for N <= 0): clear $0 and return. */
$L998:
|
||||
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* Strided path: X/Y are the load pointers, XX/YY the store pointers that follow them. */
/* The loop body is the same load / rotate / store sequence written out four times, */
/* so one pass handles 4 complex elements. */
/* SXADDQ comes from common.h; presumably it adds the stride scaled by the scalar size - confirm. */
$L50:
|
||||
mov X, XX
|
||||
mov Y, YY
|
||||
|
||||
/* I = N / 4 passes; leftovers are handled at $L56. */
sra N, 2, I
|
||||
ble I, $L55
|
||||
.align 4
|
||||
|
||||
$L51:
|
||||
/* Element 1 of 4: load both scalars of X and Y, then advance the load pointers. */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 1*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
/* $f22 = C*x + S*y, $f24 = C*y - S*x for the first scalar. */
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
/* $f26 = C*x + S*y, $f28 = C*y - S*x for the second scalar. */
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
/* Write back through the trailing pointers, then advance them. */
ST $f22, 0*SIZE(XX)
|
||||
ST $f24, 0*SIZE(YY)
|
||||
ST $f26, 1*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 1*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
|
||||
/* Element 2 of 4: same sequence. */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 1*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
ST $f24, 0*SIZE(YY)
|
||||
ST $f26, 1*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 1*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
|
||||
/* Element 3 of 4: same sequence. */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 1*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
ST $f24, 0*SIZE(YY)
|
||||
ST $f26, 1*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 1*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
|
||||
/* Element 4 of 4: same sequence. */
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD $f15, 1*SIZE(Y)
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f22, 0*SIZE(XX)
|
||||
ST $f24, 0*SIZE(YY)
|
||||
ST $f26, 1*SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
ST $f28, 1*SIZE(YY)
|
||||
SXADDQ INCY, YY, YY
|
||||
|
||||
lda I, -1(I)
|
||||
bgt I, $L51
|
||||
.align 4
|
||||
|
||||
/* Strided remainder: rotate the N mod 4 leftover complex elements one at a time. */
/* Loads and stores both go through X/Y here; XX/YY are not used. */
$L55:
|
||||
and N, 3, I
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
$L56:
|
||||
LD $f12, 0*SIZE(X)
|
||||
LD $f13, 0*SIZE(Y)
|
||||
LD $f14, 1*SIZE(X)
|
||||
LD $f15, 1*SIZE(Y)
|
||||
|
||||
MUL C, $f12, $f21
|
||||
MUL S, $f13, $f22
|
||||
MUL C, $f13, $f23
|
||||
MUL S, $f12, $f24
|
||||
|
||||
/* First scalar: $f22 = C*x + S*y, $f24 = C*y - S*x. */
ADD $f21, $f22, $f22
|
||||
SUB $f23, $f24, $f24
|
||||
|
||||
MUL C, $f14, $f25
|
||||
MUL S, $f15, $f26
|
||||
MUL C, $f15, $f27
|
||||
MUL S, $f14, $f28
|
||||
|
||||
/* Second scalar: $f26 = C*x + S*y, $f28 = C*y - S*x. */
ADD $f25, $f26, $f26
|
||||
SUB $f27, $f28, $f28
|
||||
|
||||
ST $f22, 0*SIZE(X)
|
||||
ST $f24, 0*SIZE(Y)
|
||||
lda I, -1(I)
|
||||
|
||||
ST $f26, 1*SIZE(X)
|
||||
ST $f28, 1*SIZE(Y)
|
||||
/* Advance to the next element only after both scalars have been written back. */
SXADDQ INCX, X, X
|
||||
SXADDQ INCY, Y, Y
|
||||
|
||||
bgt I, $L56
|
||||
.align 4
|
||||
|
||||
/* Exit for the strided path: clear $0 and return. */
$L999:
|
||||
clr $0
|
||||
ret
|
||||
EPILOGUE
|
||||
255
kernel/alpha/zscal.S
Normal file
255
kernel/alpha/zscal.S
Normal file
@@ -0,0 +1,255 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Element offset used by the `lds $f31, PREFETCHSIZE * SIZE(X)` in the   */
/* main loop (a load into $f31; by its name, a prefetch distance).        */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Integer registers.                                                     */
/* N    : element count; the routine exits immediately when N <= 0.       */
/* X    : read pointer into the vector (advanced by the loads).           */
/* INCX : stride; loaded from 0($sp) and doubled at entry.                */
#define N $16
|
||||
#define X $21
|
||||
#define INCX $17
|
||||
|
||||
/* XX : write pointer, initialised to X and advanced by the stores.       */
/* I  : loop counter (N >> 2 for the main loop, N & 3 for the tail).      */
#define XX $18
|
||||
#define I $19
|
||||
|
||||
/* The two scalar multipliers (presumably real / imaginary part of alpha). */
#define ALPHA_R $f19
|
||||
#define ALPHA_I $f20
|
||||
|
||||
/* s0..s3 are defined here but not referenced by the routine body below.  */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0..a7 : four loaded elements; even names hold the value at offset 0,  */
/*          odd names the value at offset 1 of each element.              */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f21
|
||||
|
||||
/* t0..t3 : the four partial products of one element.                     */
#define t0 $f22
|
||||
#define t1 $f23
|
||||
#define t2 $f24
|
||||
#define t3 $f25
|
||||
|
||||
/* t4..t7 : finished results waiting to be stored (t4/t5 and t6/t7 are    */
/*          used alternately for consecutive elements).                   */
#define t4 $f26
|
||||
#define t5 $f27
|
||||
#define t6 $f28
|
||||
#define t7 $f29
|
||||
|
||||
/* zscal entry: scales each two-component element of the vector at X in   */
/* place by (ALPHA_R, ALPHA_I).  LD/ST/MUL/ADD/SUB/SXADDQ and SIZE are    */
/* macros that are not defined in this file (presumably from common.h).   */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* The stride is the first stack argument; XX is the store pointer.       */
ldq INCX, 0($sp)
|
||||
mov X, XX
|
||||
/* Nothing to do for N <= 0.                                              */
ble N, $L999
|
||||
|
||||
/* Each element is two values wide, so double the stride.                 */
addq INCX, INCX, INCX
|
||||
|
||||
/* I = number of groups of four elements; none -> go to the tail loop.    */
sra N, 2, I
|
||||
ble I, $L15
|
||||
|
||||
/* Prime the pipeline: load the first four elements into a0..a7.          */
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a2, 0 * SIZE(X)
|
||||
|
||||
LD a3, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a4, 0 * SIZE(X)
|
||||
LD a5, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a6, 0 * SIZE(X)
|
||||
LD a7, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* Result for the first element:                                          */
/*   t4 = a0*ALPHA_R - a1*ALPHA_I,  t5 = a0*ALPHA_I + a1*ALPHA_R          */
MUL a0, ALPHA_R, t0
|
||||
MUL a1, ALPHA_I, t1
|
||||
MUL a0, ALPHA_I, t2
|
||||
MUL a1, ALPHA_R, t3
|
||||
|
||||
SUB t0, t1, t4
|
||||
ADD t2, t3, t5
|
||||
|
||||
/* One group is already loaded; if it was the only one, skip the loop     */
/* and go straight to the drain code.                                     */
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/* Software-pipelined main loop.  On entry t4/t5 hold the finished result */
/* for the element that was in a0/a1, and a2..a7 hold the next three      */
/* loaded elements.  Each iteration stores four results through XX and    */
/* loads the following four elements through X, re-establishing the same  */
/* state at the bottom.                                                   */
$L12:
|
||||
/* Store element 0; multiply element 1 (a2/a3); reload a0/a1.             */
ST t4, 0 * SIZE(XX)
|
||||
MUL a2, ALPHA_R, t0
|
||||
ST t5, 1 * SIZE(XX)
|
||||
MUL a3, ALPHA_I, t1
|
||||
|
||||
MUL a2, ALPHA_I, t2
|
||||
LD a0, 0 * SIZE(X)
|
||||
MUL a3, ALPHA_R, t3
|
||||
LD a1, 1 * SIZE(X)
|
||||
|
||||
SUB t0, t1, t6
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t7
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* Store element 1; multiply element 2 (a4/a5); reload a2/a3.             */
MUL a4, ALPHA_R, t0
|
||||
ST t6, 0 * SIZE(XX)
|
||||
MUL a5, ALPHA_I, t1
|
||||
ST t7, 1 * SIZE(XX)
|
||||
|
||||
MUL a4, ALPHA_I, t2
|
||||
LD a2, 0 * SIZE(X)
|
||||
MUL a5, ALPHA_R, t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
|
||||
SUB t0, t1, t4
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t5
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* Store element 2; multiply element 3 (a6/a7); reload a4/a5.             */
MUL a6, ALPHA_R, t0
|
||||
ST t4, 0 * SIZE(XX)
|
||||
MUL a7, ALPHA_I, t1
|
||||
ST t5, 1 * SIZE(XX)
|
||||
|
||||
MUL a6, ALPHA_I, t2
|
||||
LD a4, 0 * SIZE(X)
|
||||
MUL a7, ALPHA_R, t3
|
||||
LD a5, 1 * SIZE(X)
|
||||
|
||||
SUB t0, t1, t6
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t7
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* Store element 3; multiply the freshly loaded a0/a1 (first element of   */
/* the next group); reload a6/a7.                                         */
MUL a0, ALPHA_R, t0
|
||||
ST t6, 0 * SIZE(XX)
|
||||
MUL a1, ALPHA_I, t1
|
||||
ST t7, 1 * SIZE(XX)
|
||||
|
||||
MUL a0, ALPHA_I, t2
|
||||
LD a6, 0 * SIZE(X)
|
||||
MUL a1, ALPHA_R, t3
|
||||
LD a7, 1 * SIZE(X)
|
||||
|
||||
SUB t0, t1, t4
|
||||
lda I, -1(I)
|
||||
ADD t2, t3, t5
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
/* Load into $f31 at a distance of PREFETCHSIZE elements ahead of X       */
/* (presumably a prefetch hint), then finish the last pointer advance.    */
lds $f31, PREFETCHSIZE * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
/* Pipeline drain: t4/t5 hold one finished result and a2..a7 hold three   */
/* already-loaded elements.  Compute and store all four without issuing   */
/* any further loads.                                                     */
$L13:
|
||||
/* Store the pending result; compute element from a2/a3 into t6/t7.       */
MUL a2, ALPHA_R, t0
|
||||
MUL a3, ALPHA_I, t1
|
||||
ST t4, 0 * SIZE(XX)
|
||||
MUL a2, ALPHA_I, t2
|
||||
ST t5, 1 * SIZE(XX)
|
||||
MUL a3, ALPHA_R, t3
|
||||
|
||||
SUB t0, t1, t6
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t7
|
||||
unop
|
||||
|
||||
/* Store t6/t7; compute element from a4/a5 into t4/t5.                    */
ST t6, 0 * SIZE(XX)
|
||||
MUL a4, ALPHA_R, t0
|
||||
ST t7, 1 * SIZE(XX)
|
||||
MUL a5, ALPHA_I, t1
|
||||
MUL a4, ALPHA_I, t2
|
||||
MUL a5, ALPHA_R, t3
|
||||
|
||||
SUB t0, t1, t4
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t5
|
||||
unop
|
||||
|
||||
/* Store t4/t5; compute element from a6/a7 into t6/t7.                    */
MUL a6, ALPHA_R, t0
|
||||
ST t4, 0 * SIZE(XX)
|
||||
MUL a7, ALPHA_I, t1
|
||||
ST t5, 1 * SIZE(XX)
|
||||
|
||||
MUL a6, ALPHA_I, t2
|
||||
MUL a7, ALPHA_R, t3
|
||||
|
||||
SUB t0, t1, t6
|
||||
SXADDQ INCX, XX, XX
|
||||
ADD t2, t3, t7
|
||||
|
||||
/* Store the last result of the group and step XX past it, so XX and X    */
/* point at the same element when the tail code starts.                   */
ST t6, 0 * SIZE(XX)
|
||||
ST t7, 1 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
.align 4
|
||||
|
||||
/* Tail: handle the N & 3 elements left over after the groups of four.    */
$L15:
|
||||
and N, 3, I
|
||||
/* The unops look like padding only (presumably for issue alignment).     */
unop
|
||||
unop
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
/* One element per iteration: load through X, store through XX.           */
$L17:
|
||||
LD a0, 0 * SIZE(X)
|
||||
LD a1, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* t4 = a0*ALPHA_R - a1*ALPHA_I,  t5 = a0*ALPHA_I + a1*ALPHA_R            */
MUL a0, ALPHA_R, t0
|
||||
MUL a1, ALPHA_I, t1
|
||||
MUL a0, ALPHA_I, t2
|
||||
MUL a1, ALPHA_R, t3
|
||||
|
||||
SUB t0, t1, t4
|
||||
ADD t2, t3, t5
|
||||
|
||||
ST t4, 0 * SIZE(XX)
|
||||
ST t5, 1 * SIZE(XX)
|
||||
SXADDQ INCX, XX, XX
|
||||
|
||||
lda I, -1(I)
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
/* Common exit.  $0 (the integer return-value register) is not written    */
/* anywhere else in this routine, so clear it here to return a defined 0, */
/* matching the `clr $0` exits of the zrot and zswap kernels.             */
/* NOTE(review): assumes callers treat the result as a status/ignored int */
/* -- confirm against the C prototype.                                    */
$L999:
|
||||
clr $0
ret
|
||||
EPILOGUE
|
||||
244
kernel/alpha/zswap.S
Normal file
244
kernel/alpha/zswap.S
Normal file
@@ -0,0 +1,244 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* zswap entry: exchanges $16 two-component elements between the vector   */
/* at $17 (stride $18) and the vector at $19 (stride $20).                */
/*   $16       : element count (exit immediately when <= 0)               */
/*   $21 -> $17: first vector pointer                                     */
/*   0($sp)    : stride of the first vector  -> $18                       */
/*   8($sp)    : second vector pointer       -> $19                       */
/*   16($sp)   : stride of the second vector -> $20                       */
/* LD/ST/SXADDQ and SIZE are macros that are not defined in this file     */
/* (presumably from common.h).                                            */
PROLOGUE
|
||||
PROFCODE
|
||||
.frame $sp, 0, $26, 0
|
||||
|
||||
mov $21, $17
|
||||
/* Read the full 64-bit stride slots (ldq, not ldl).  A 32-bit load would */
/* truncate the stride, and a value whose low 32 bits equal 1 would then  */
/* pass the `cmpeq ..., 1` test below and take the contiguous path.       */
ldq $18, 0($sp)
|
||||
ldq $19, 8($sp)
|
||||
ldq $20, 16($sp)
|
||||
#ifndef PROFILE
|
||||
.prologue 0
|
||||
#else
|
||||
|
||||
.prologue 1
|
||||
#endif
|
||||
|
||||
ble $16, $SubEnd # if n <= 0 goto $SubEnd
|
||||
|
||||
/* $1 = (stride1 == 1) && (stride2 == 1), tested before the strides are   */
/* doubled (each element is two values wide).                             */
cmpeq $18, 1, $1
|
||||
addq $18, $18, $18
|
||||
cmpeq $20, 1, $2
|
||||
addq $20, $20, $20
|
||||
|
||||
/* $21 = count / 4 (unrolled iterations), $22 = count % 4 (remainder).    */
sra $16, 2, $21
|
||||
and $1, $2, $1
|
||||
and $16, 3, $22
|
||||
/* Any non-unit stride -> general strided path.                           */
beq $1, $Sub
|
||||
|
||||
/* Contiguous path: fewer than four elements -> remainder loop only.      */
ble $21, $MainRemain
|
||||
.align 4
|
||||
|
||||
/* Contiguous path, unrolled by four elements (eight values) per pass.    */
/* All sixteen values are loaded before any store, so the exchange does   */
/* not depend on store/load ordering within the iteration.                */
$MainLoop:
|
||||
/* Eight values from the second vector ($19) -> $f10..$f17.               */
LD $f10, 0*SIZE($19)
|
||||
LD $f11, 1*SIZE($19)
|
||||
LD $f12, 2*SIZE($19)
|
||||
LD $f13, 3*SIZE($19)
|
||||
LD $f14, 4*SIZE($19)
|
||||
LD $f15, 5*SIZE($19)
|
||||
LD $f16, 6*SIZE($19)
|
||||
LD $f17, 7*SIZE($19)
|
||||
|
||||
/* Eight values from the first vector ($17) -> $f20..$f27.                */
LD $f20, 0*SIZE($17)
|
||||
LD $f21, 1*SIZE($17)
|
||||
LD $f22, 2*SIZE($17)
|
||||
LD $f23, 3*SIZE($17)
|
||||
LD $f24, 4*SIZE($17)
|
||||
LD $f25, 5*SIZE($17)
|
||||
LD $f26, 6*SIZE($17)
|
||||
LD $f27, 7*SIZE($17)
|
||||
|
||||
/* Loads into $f31 sixteen values ahead on both vectors (presumably       */
/* prefetch hints), with the loop-counter decrement scheduled alongside.  */
/* NOTE(review): subl is a 32-bit subtract -- assumes the count fits.     */
lds $f31, 16*SIZE($17)
|
||||
unop
|
||||
lds $f31, 16*SIZE($19)
|
||||
subl $21, 1, $21
|
||||
|
||||
/* Write the second vector's values into the first vector ...             */
ST $f10, 0*SIZE($17)
|
||||
ST $f11, 1*SIZE($17)
|
||||
ST $f12, 2*SIZE($17)
|
||||
ST $f13, 3*SIZE($17)
|
||||
ST $f14, 4*SIZE($17)
|
||||
ST $f15, 5*SIZE($17)
|
||||
ST $f16, 6*SIZE($17)
|
||||
ST $f17, 7*SIZE($17)
|
||||
|
||||
/* ... and the first vector's values into the second.                     */
ST $f20, 0*SIZE($19)
|
||||
ST $f21, 1*SIZE($19)
|
||||
ST $f22, 2*SIZE($19)
|
||||
ST $f23, 3*SIZE($19)
|
||||
ST $f24, 4*SIZE($19)
|
||||
ST $f25, 5*SIZE($19)
|
||||
ST $f26, 6*SIZE($19)
|
||||
ST $f27, 7*SIZE($19)
|
||||
|
||||
/* Advance both pointers by eight values and loop while groups remain.    */
lda $17, 8*SIZE($17)
|
||||
lda $19, 8*SIZE($19)
|
||||
bgt $21, $MainLoop
|
||||
.align 4
|
||||
|
||||
/* Contiguous path, remainder: $22 = count % 4 elements still to swap.    */
$MainRemain:
|
||||
ble $22, $MainEnd
|
||||
.align 4
|
||||
|
||||
/* Swap one element (two values) per iteration.                           */
$MainRemainLoop:
|
||||
LD $f10, 0*SIZE($19)
|
||||
LD $f11, 1*SIZE($19)
|
||||
LD $f20, 0*SIZE($17)
|
||||
LD $f21, 1*SIZE($17)
|
||||
|
||||
/* The pointers are advanced before the stores, hence the negative        */
/* offsets (-2, -1) that address the element just loaded.                 */
lda $17, 2*SIZE($17)
|
||||
lda $19, 2*SIZE($19)
|
||||
subl $22, 1, $22
|
||||
ST $f10, -2*SIZE($17)
|
||||
ST $f11, -1*SIZE($17)
|
||||
ST $f20, -2*SIZE($19)
|
||||
ST $f21, -1*SIZE($19)
|
||||
bgt $22, $MainRemainLoop
|
||||
.align 4
|
||||
|
||||
/* Exit of the contiguous path: return 0 in $0.                           */
$MainEnd:
|
||||
clr $0
|
||||
ret
|
||||
.align 4
|
||||
|
||||
/* General strided path.  $17/$19 are used as read pointers and $23/$24   */
/* as separate write pointers into the same two vectors, so that all      */
/* loads of an iteration can be issued before the stores.                 */
$Sub:
|
||||
mov $17, $23
|
||||
mov $19, $24
|
||||
ble $21, $SubRemain
|
||||
.align 4
|
||||
|
||||
/* Four elements per iteration; $18 / $20 are the (doubled) strides of    */
/* the first / second vector, applied through SXADDQ.                     */
$SubLoop:
|
||||
/* Read four elements of the second vector -> $f10..$f17.                 */
LD $f10, 0*SIZE($19)
|
||||
LD $f11, 1*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f12, 0*SIZE($19)
|
||||
LD $f13, 1*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f14, 0*SIZE($19)
|
||||
LD $f15, 1*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
LD $f16, 0*SIZE($19)
|
||||
LD $f17, 1*SIZE($19)
|
||||
SXADDQ $20, $19, $19
|
||||
|
||||
/* Read four elements of the first vector -> $f20..$f27.                  */
LD $f20, 0*SIZE($17)
|
||||
LD $f21, 1*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f22, 0*SIZE($17)
|
||||
LD $f23, 1*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f24, 0*SIZE($17)
|
||||
LD $f25, 1*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
LD $f26, 0*SIZE($17)
|
||||
LD $f27, 1*SIZE($17)
|
||||
SXADDQ $18, $17, $17
|
||||
|
||||
/* Write the second vector's values into the first vector via $23.        */
ST $f10, 0*SIZE($23)
|
||||
ST $f11, 1*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f12, 0*SIZE($23)
|
||||
ST $f13, 1*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f14, 0*SIZE($23)
|
||||
ST $f15, 1*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
ST $f16, 0*SIZE($23)
|
||||
ST $f17, 1*SIZE($23)
|
||||
SXADDQ $18, $23, $23
|
||||
|
||||
/* Write the first vector's values into the second vector via $24.        */
ST $f20, 0*SIZE($24)
|
||||
ST $f21, 1*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f22, 0*SIZE($24)
|
||||
ST $f23, 1*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f24, 0*SIZE($24)
|
||||
ST $f25, 1*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
ST $f26, 0*SIZE($24)
|
||||
ST $f27, 1*SIZE($24)
|
||||
SXADDQ $20, $24, $24
|
||||
|
||||
/* NOTE(review): subl is a 32-bit subtract -- assumes the count fits.     */
subl $21, 1, $21
|
||||
bgt $21, $SubLoop
|
||||
.align 4
|
||||
|
||||
/* Strided path, remainder: $22 = count % 4 elements still to swap.       */
/* The read pointers $17/$19 have advanced exactly as far as the write    */
/* pointers, so they are used for both loads and stores from here on.     */
$SubRemain:
|
||||
ble $22, $SubEnd
|
||||
.align 4
|
||||
|
||||
/* Swap one element (two values) per iteration, then step by the stride.  */
$SubRemainLoop:
|
||||
LD $f10, 0*SIZE($19)
|
||||
LD $f11, 1*SIZE($19)
|
||||
LD $f20, 0*SIZE($17)
|
||||
LD $f21, 1*SIZE($17)
|
||||
|
||||
subl $22, 1, $22
|
||||
|
||||
ST $f10, 0*SIZE($17)
|
||||
ST $f11, 1*SIZE($17)
|
||||
ST $f20, 0*SIZE($19)
|
||||
ST $f21, 1*SIZE($19)
|
||||
|
||||
SXADDQ $18, $17, $17
|
||||
SXADDQ $20, $19, $19
|
||||
bgt $22, $SubRemainLoop
|
||||
.align 4
|
||||
|
||||
/* Exit of the strided path (also the target for count <= 0): return 0.   */
$SubEnd:
|
||||
clr $0
|
||||
ret
|
||||
EPILOGUE
|
||||
2237
kernel/alpha/ztrsm_kernel_2x2_LN.S
Normal file
2237
kernel/alpha/ztrsm_kernel_2x2_LN.S
Normal file
File diff suppressed because it is too large
Load Diff
2230
kernel/alpha/ztrsm_kernel_2x2_LT.S
Normal file
2230
kernel/alpha/ztrsm_kernel_2x2_LT.S
Normal file
File diff suppressed because it is too large
Load Diff
2230
kernel/alpha/ztrsm_kernel_2x2_RT.S
Normal file
2230
kernel/alpha/ztrsm_kernel_2x2_RT.S
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user