Merge branch 'develop' into dev/slewis/merge-from-riscv
Commit: 3ffd6868d7

cblas.h | 13
@@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

@@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
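Unlike the i?amax/i?amin family, these extension routines return the extreme absolute value itself rather than its index; judging from the kernels and unit tests later in this commit, the complex variants reduce over |Re| + |Im|. A minimal usage sketch, assuming a build of this branch with cblas.h on the include path and -lopenblas at link time (illustrative only):

    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        float xr[] = { -1.0f, 3.5f, -2.0f };        /* real vector */
        float xc[] = { -1.1f, 2.2f, -3.3f, 4.4f };  /* two complex elements: (-1.1,2.2), (-3.3,4.4) */

        /* largest |x[i]| of the real vector -> 3.5 */
        float amax = cblas_samax(3, xr, 1);

        /* smallest |Re| + |Im| over the complex vector -> |-1.1| + |2.2| = 3.3 */
        float camin = cblas_scamin(2, xc, 1);

        printf("samax = %f, scamin = %f\n", amax, camin);
        return 0;
    }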
@@ -130,6 +130,8 @@ endif ()
foreach (float_type ${FLOAT_TYPES})

  if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
    GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})

    GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
    GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
    GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
@@ -270,7 +270,8 @@ CSBLAS1OBJS = \
    cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
    cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
    cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
    cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
    cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
    cblas_samin.$(SUFFIX)

CSBLAS2OBJS = \
    cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -295,7 +296,8 @@ CDBLAS1OBJS = \
    cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
    cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
    cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
    cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
    cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
    cblas_damin.$(SUFFIX)

CDBLAS2OBJS = \
    cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -315,7 +317,7 @@ CCBLAS1OBJS = \
    cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
    cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
    cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
    cblas_caxpby.$(SUFFIX) \
    cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
    cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)

CCBLAS2OBJS = \
@@ -340,12 +342,12 @@ CXERBLAOBJ = \

CZBLAS1OBJS = \
    cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
    cblas_zcopy.$(SUFFIX) \
    cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
    cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
    cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
    cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
    cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
    cblas_zaxpby.$(SUFFIX) \
    cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
    cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
@@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
    $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)

cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
    $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)

cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
    $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
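The rules above compile the same generic source several times: -DUSE_ABS selects the absolute-value variants (amax/amin) and -DUSE_MIN flips the search from a maximum to a minimum. A simplified, hypothetical sketch of how such a compile-time flag matrix typically works in a single-source kernel (not the actual interface/max.c, just the pattern):

    #include <math.h>

    /* Illustrative only: one source file, four behaviors chosen at compile time. */
    #ifdef USE_ABS
    #define MEASURE(v) fabs(v)          /* amax / amin: compare |x[i]| */
    #else
    #define MEASURE(v) (v)              /* max / min: compare x[i] directly */
    #endif

    #ifdef USE_MIN
    #define BETTER(a, b) ((a) < (b))    /* keep the smaller value */
    #else
    #define BETTER(a, b) ((a) > (b))    /* keep the larger value */
    #endif

    double reduce(const double *x, int n, int incx)
    {
        double best = MEASURE(x[0]);
        for (int i = 1; i < n; i++) {
            double v = MEASURE(x[i * incx]);
            if (BETTER(v, best)) best = v;
        }
        return best;
    }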
@@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
    $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
    $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)

cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
    $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)

cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
    $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)

sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c
    $(CC) $(CFLAGS) -c $< -o $(@F)

dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
    $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
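The caxpyc/zaxpyc objects are built from the same zaxpy.c source with -DCONJ, i.e. the conjugated flavor of axpy. As a rough sketch of what that switch means for a complex axpy (illustrative only; which operand the real kernel conjugates is defined by zaxpy.c, and this sketch assumes it is the scale factor alpha):

    #include <complex.h>

    /* Illustrative helper: y := y + alpha * x, with an optional conjugated variant. */
    void axpy_sketch(int n, double complex alpha, const double complex *x,
                     double complex *y, int conjugated)
    {
        double complex a = conjugated ? conj(alpha) : alpha;
        for (int i = 0; i < n; i++)
            y[i] += a * x[i];
    }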
@@ -145,7 +145,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){

#else

#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
    FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif

    FLOAT ret;
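For the complex entry points the interface receives the vector as void * and reinterprets it as interleaved (re, im) pairs, as the cast above shows. A standalone sketch of the reduction these routines perform on complex data, using the |Re| + |Im| measure that the kernels and unit tests in this commit rely on (illustrative helper, not OpenBLAS code):

    #include <math.h>

    /* Largest |Re| + |Im| over n interleaved complex elements with stride incx. */
    static float scamax_reference(int n, const void *vx, int incx)
    {
        const float *x = (const float *) vx;   /* (re, im) pairs */
        float best = 0.0f;
        for (int i = 0; i < n; i++) {
            const float *e = x + 2 * i * incx;
            float v = fabsf(e[0]) + fabsf(e[1]);
            if (v > best) best = v;
        }
        return best;
    }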
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lsx.S
SAMAXKERNEL = amax_lsx.S
DAMAXKERNEL = amax_lsx.S
CAMAXKERNEL = camax_lsx.S
ZAMAXKERNEL = camax_lsx.S

SAMINKERNEL = amin_lsx.S
DAMINKERNEL = amin_lsx.S
CAMINKERNEL = camin_lsx.S
ZAMINKERNEL = camin_lsx.S

SMAXKERNEL = max_lsx.S
DMAXKERNEL = max_lsx.S
@@ -14,10 +14,12 @@ ZSCALKERNEL = cscal_lasx.S
SAMAXKERNEL = amax_lasx.S
DAMAXKERNEL = amax_lasx.S
CAMAXKERNEL = camax_lasx.S
ZAMAXKERNEL = camax_lasx.S

SAMINKERNEL = amin_lasx.S
DAMINKERNEL = amin_lasx.S
CAMINKERNEL = camin_lasx.S
ZAMINKERNEL = camin_lasx.S

SMAXKERNEL = max_lsx.S
DMAXKERNEL = max_lsx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
    xvldrepl.w VM0, X, 0
#endif
    XVFSUB VM0, VM0, VM0
    bne INCX, TEMP, .L20

    srai.d I, N, 4
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
    vldrepl.w VM0, X, 0
#endif
    VFSUB VM0, VM0, VM0
    bne INCX, TEMP, .L20

    srai.d I, N, 3
@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
li.w I, -1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
xvreplgr2vr.w neg1, I
|
||||
xvffint.s.w neg1, neg1
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX1, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvld VX0, X, 0
|
||||
xvld VX1, X, 32
|
||||
#ifdef DOUBLE
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
#else
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvfmul.s x3, neg1, x1
|
||||
xvfmul.s x4, neg1, x2
|
||||
xvfcmp.clt.s VT0, x1, res0
|
||||
xvfcmp.clt.s VT1, x2, res0
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VT1
|
||||
#endif
|
||||
XVFSUB x3, res0, x1
|
||||
XVFSUB x4, res0, x2
|
||||
XVFMAX x1, x1, x3
|
||||
XVFMAX x2, x2, x4
|
||||
XVFADD VM1, x1, x2
|
||||
XVFMAX VM0, VM0, VM1
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 64
|
||||
xvld VX1, X, 96
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
XVFSUB x3, res0, x1
|
||||
XVFSUB x4, res0, x2
|
||||
XVFMAX x1, x1, x3
|
||||
XVFMAX x2, x2, x4
|
||||
XVFADD VM1, x1, x2
|
||||
XVFMAX VM0, VM0, VM1
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 16 * SIZE
|
||||
xvfadd.s VM1, x1, x2
|
||||
xvfmax.s VM0, VM0, VM1
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
XVFMAX VM0, x1, x2
|
||||
#else
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfmax.s VM1, x1, x2
|
||||
xvfmax.s VM0, x3, x4
|
||||
xvfmax.s VM0, VM0, VM1
|
||||
XVFMAX VM0, x1, x2
|
||||
XVFMAX VM1, x3, x4
|
||||
XVFMAX VM0, VM0, VM1
|
||||
#endif
|
||||
b .L23
|
||||
.align 3
|
||||
|
||||
|
@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L21:
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
addi.d I, I, -1
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s3, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s3, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s4, t1, t3
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s4, t1, t3
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
fmax.s s1, s1, s2
|
||||
fmax.s s3, s3, s4
|
||||
fmax.s s1, s1, s3
|
||||
FMAX s1, s1, s2
|
||||
FMAX s3, s3, s4
|
||||
FMAX s1, s1, s3
|
||||
.align 3
|
||||
|
||||
.L23: //N<8
|
||||
|
@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
FABS a1, a1
|
||||
ADD a0, a0, a1
|
||||
add.d X, X, INCX
|
||||
fmax.s s1, a0, s1
|
||||
FMAX s1, a0, s1
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
fmov.s $f0, $f22
|
||||
MOV $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
|
|
|
@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
li.w I, -1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
vreplgr2vr.w neg1, I
|
||||
vffint.s.w neg1, neg1
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX1, X, 4 * SIZE
|
||||
addi.d I, I, -1
|
||||
vld VX0, X, 0
|
||||
vld VX1, X, 16
|
||||
#ifdef DOUBLE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#else
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmul.s x3, neg1, x1
|
||||
vfmul.s x4, neg1, x2
|
||||
vfcmp.clt.s VT0, x1, res0
|
||||
vfcmp.clt.s VT1, x2, res0
|
||||
vld VX0, X, 8 * SIZE
|
||||
vbitsel.v x1, x1, x3, VT0
|
||||
vbitsel.v x2, x2, x4, VT1
|
||||
vld VX1, X, 12 * SIZE
|
||||
vfadd.s VM1, x1, x2
|
||||
#endif
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD VM1, x1, x2
|
||||
|
||||
vld VX0, X, 32
|
||||
vld VX1, X, 48
|
||||
#ifdef DOUBLE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#else
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmul.s x3, neg1, x1
|
||||
vfmul.s x4, neg1, x2
|
||||
vfcmp.clt.s VT0, x1, res0
|
||||
vfcmp.clt.s VT1, x2, res0
|
||||
#endif
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD x1, x1, x2
|
||||
VFMAX VM1, x1, VM1
|
||||
VFMAX VM0, VM0, VM1
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 64
|
||||
vld VX1, X, 80
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD VM1, x1, x2
|
||||
|
||||
vld VX0, X, 96
|
||||
vld VX1, X, 112
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD x1, x1, x2
|
||||
VFMAX VM1, x1, VM1
|
||||
VFMAX VM0, VM0, VM1
|
||||
#endif
|
||||
addi.d X, X, 16 * SIZE
|
||||
vbitsel.v x1, x1, x3, VT0
|
||||
vbitsel.v x2, x2, x4, VT1
|
||||
vfadd.s x1, x1, x2
|
||||
vfmax.s VM1, x1, VM1
|
||||
vfmax.s VM0, VM0, VM1
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
VFMAX VM0, x1, x2
|
||||
#else
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmax.s VM1, x1, x2
|
||||
vfmax.s VM0, x3, x4
|
||||
vfmax.s VM0, VM0, VM1
|
||||
VFMAX VM1, x1, x2
|
||||
VFMAX VM0, x3, x4
|
||||
VFMAX VM0, VM0, VM1
|
||||
#endif
|
||||
b .L23
|
||||
.align 3
|
||||
|
||||
|
@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L21:
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
addi.d I, I, -1
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s3, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s3, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmax.s s4, t1, t3
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMAX s4, t1, t3
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
fmax.s s1, s1, s2
|
||||
fmax.s s3, s3, s4
|
||||
fmax.s s1, s1, s3
|
||||
FMAX s1, s1, s2
|
||||
FMAX s3, s3, s4
|
||||
FMAX s1, s1, s3
|
||||
.align 3
|
||||
|
||||
.L23: //N<8
|
||||
|
@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L24:
|
||||
fld.s a0, X, 0 * SIZE
|
||||
fld.s a1, X, 1 * SIZE
|
||||
LD a0, X, 0 * SIZE
|
||||
LD a1, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
fabs.s a0, a0
|
||||
fabs.s a1, a1
|
||||
fadd.s a0, a0, a1
|
||||
FABS a0, a0
|
||||
FABS a1, a1
|
||||
ADD a0, a0, a1
|
||||
add.d X, X, INCX
|
||||
fmax.s s1, a0, s1
|
||||
FMAX s1, a0, s1
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
fmov.s $f0, $f22
|
||||
MOV $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
|
|
|
@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
xvxor.v res0, res0, res0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
fld.s a0, X, 0 * SIZE
|
||||
fld.s a1, X, 1 * SIZE
|
||||
fabs.s a0, a0
|
||||
fabs.s a1, a1
|
||||
fadd.s s1, a1, a0
|
||||
LD a0, X, 0 * SIZE
|
||||
LD a1, X, 1 * SIZE
|
||||
FABS a0, a0
|
||||
FABS a1, a1
|
||||
ADD s1, a1, a0
|
||||
#ifdef DOUBLE
|
||||
xvreplve0.d VM0, VM0
|
||||
#else
|
||||
xvreplve0.w VM0, VM0
|
||||
#endif
|
||||
li.d TEMP, 1
|
||||
li.w I, -1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
xvreplgr2vr.w neg1, I
|
||||
xvffint.s.w neg1, neg1
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX1, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvld VX0, X, 0
|
||||
xvld VX1, X, 32
|
||||
#ifdef DOUBLE
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
#else
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvfmul.s x3, neg1, x1
|
||||
xvfmul.s x4, neg1, x2
|
||||
xvfcmp.clt.s VT0, x1, res0
|
||||
xvfcmp.clt.s VT1, x2, res0
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VT1
|
||||
#endif
|
||||
XVFSUB x3, res0, x1
|
||||
XVFSUB x4, res0, x2
|
||||
XVFMAX x1, x1, x3
|
||||
XVFMAX x2, x2, x4
|
||||
XVFADD VM1, x1, x2
|
||||
XVFMIN VM0, VM0, VM1
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 64
|
||||
xvld VX1, X, 96
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
XVFSUB x3, res0, x1
|
||||
XVFSUB x4, res0, x2
|
||||
XVFMAX x1, x1, x3
|
||||
XVFMAX x2, x2, x4
|
||||
XVFADD VM1, x1, x2
|
||||
XVFMIN VM0, VM0, VM1
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 16 * SIZE
|
||||
xvfadd.s VM1, x1, x2
|
||||
xvfmin.s VM0, VM0, VM1
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
XVFMIN VM0, x1, x2
|
||||
#else
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfmin.s VM1, x1, x2
|
||||
xvfmin.s VM0, x3, x4
|
||||
xvfmin.s VM0, VM0, VM1
|
||||
XVFMIN VM0, x1, x2
|
||||
XVFMIN VM1, x3, x4
|
||||
XVFMIN VM0, VM0, VM1
|
||||
#endif
|
||||
b .L23
|
||||
.align 3
|
||||
|
||||
|
@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L21:
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
addi.d I, I, -1
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s3, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s3, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s4, t1, t3
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s4, t1, t3
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
fmin.s s1, s1, s2
|
||||
fmin.s s3, s3, s4
|
||||
fmin.s s1, s1, s3
|
||||
FMIN s1, s1, s2
|
||||
FMIN s3, s3, s4
|
||||
FMIN s1, s1, s3
|
||||
.align 3
|
||||
|
||||
.L23: //N<8
|
||||
|
@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
FABS a1, a1
|
||||
ADD a0, a0, a1
|
||||
add.d X, X, INCX
|
||||
fmin.s s1, a0, s1
|
||||
FMIN s1, a0, s1
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
fmov.s $f0, $f22
|
||||
MOV $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
|
|
|
@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
vxor.v res0, res0, res0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
fld.s a0, X, 0 * SIZE
|
||||
fld.s a1, X, 1 * SIZE
|
||||
fabs.s a0, a0
|
||||
fabs.s a1, a1
|
||||
fadd.s s1, a1, a0
|
||||
LD a0, X, 0 * SIZE
|
||||
LD a1, X, 1 * SIZE
|
||||
FABS a0, a0
|
||||
FABS a1, a1
|
||||
ADD s1, a1, a0
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VM0, VM0, 0
|
||||
#else
|
||||
vreplvei.w VM0, VM0, 0
|
||||
#endif
|
||||
li.d TEMP, 1
|
||||
li.w I, -1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
vreplgr2vr.w neg1, I
|
||||
vffint.s.w neg1, neg1
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX0, X, 0
|
||||
vld VX1, X, 16
|
||||
#ifdef DOUBLE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#else
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
#endif
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD VM1, x1, x2
|
||||
|
||||
vld VX0, X, 32
|
||||
vld VX1, X, 48
|
||||
#ifdef DOUBLE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#else
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
#endif
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD x1, x1, x2
|
||||
VFMIN VM1, x1, VM1
|
||||
VFMIN VM0, VM0, VM1
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 64
|
||||
vld VX1, X, 80
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD VM1, x1, x2
|
||||
|
||||
vld VX0, X, 96
|
||||
vld VX1, X, 112
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
VFSUB x3, res0, x1
|
||||
VFSUB x4, res0, x2
|
||||
VFMAX x1, x1, x3
|
||||
VFMAX x2, x2, x4
|
||||
VFADD x1, x1, x2
|
||||
VFMIN VM1, x1, VM1
|
||||
VFMIN VM0, VM0, VM1
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmul.s x3, neg1, x1
|
||||
vfmul.s x4, neg1, x2
|
||||
vfcmp.clt.s VT0, x1, res0
|
||||
vfcmp.clt.s VT1, x2, res0
|
||||
vld VX0, X, 8 * SIZE
|
||||
vbitsel.v x1, x1, x3, VT0
|
||||
vbitsel.v x2, x2, x4, VT1
|
||||
vld VX1, X, 12 * SIZE
|
||||
vfadd.s VM1, x1, x2
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmul.s x3, neg1, x1
|
||||
vfmul.s x4, neg1, x2
|
||||
vfcmp.clt.s VT0, x1, res0
|
||||
vfcmp.clt.s VT1, x2, res0
|
||||
addi.d X, X, 16 * SIZE
|
||||
vbitsel.v x1, x1, x3, VT0
|
||||
vbitsel.v x2, x2, x4, VT1
|
||||
vfadd.s x1, x1, x2
|
||||
vfmin.s VM1, x1, VM1
|
||||
vfmin.s VM0, VM0, VM1
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
VFMIN VM0, x1, x2
|
||||
#else
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmin.s VM1, x1, x2
|
||||
vfmin.s VM0, x3, x4
|
||||
vfmin.s VM0, VM0, VM1
|
||||
VFMIN VM1, x1, x2
|
||||
VFMIN VM0, x3, x4
|
||||
VFMIN VM0, VM0, VM1
|
||||
#endif
|
||||
b .L23
|
||||
.align 3
|
||||
|
||||
|
@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L21:
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s1, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s1, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
addi.d I, I, -1
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s3, t1, t3
|
||||
fld.s t1, X, 0 * SIZE
|
||||
fld.s t2, X, 1 * SIZE
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s3, t1, t3
|
||||
LD t1, X, 0 * SIZE
|
||||
LD t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fld.s t3, X, 0 * SIZE
|
||||
fld.s t4, X, 1 * SIZE
|
||||
LD t3, X, 0 * SIZE
|
||||
LD t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fabs.s t1, t1
|
||||
fabs.s t2, t2
|
||||
fabs.s t3, t3
|
||||
fabs.s t4, t4
|
||||
fadd.s t1, t1, t2
|
||||
fadd.s t3, t3, t4
|
||||
fmin.s s4, t1, t3
|
||||
FABS t1, t1
|
||||
FABS t2, t2
|
||||
FABS t3, t3
|
||||
FABS t4, t4
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
FMIN s4, t1, t3
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
fmin.s s1, s1, s2
|
||||
fmin.s s3, s3, s4
|
||||
fmin.s s1, s1, s3
|
||||
FMIN s1, s1, s2
|
||||
FMIN s3, s3, s4
|
||||
FMIN s1, s1, s3
|
||||
.align 3
|
||||
|
||||
.L23: //N<8
|
||||
|
@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.align 3
|
||||
|
||||
.L24:
|
||||
fld.s a0, X, 0 * SIZE
|
||||
fld.s a1, X, 1 * SIZE
|
||||
LD a0, X, 0 * SIZE
|
||||
LD a1, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
fabs.s a0, a0
|
||||
fabs.s a1, a1
|
||||
fadd.s a0, a0, a1
|
||||
FABS a0, a0
|
||||
FABS a1, a1
|
||||
ADD a0, a0, a1
|
||||
add.d X, X, INCX
|
||||
fmin.s s1, a0, s1
|
||||
FMIN s1, a0, s1
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
fmov.s $f0, $f22
|
||||
MOV $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
|
|
|
@@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    b .L113 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
    bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0
    bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
b .L997
|
||||
.align 3
|
||||
|
||||
.L112: //alpha_r == 0.0 && alpha_i != 0.0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
xvfmul.d x3, VXAI, x2
|
||||
xvfsub.d x3, VXZ, x3
|
||||
xvfmul.d x4, VXAI, x1
|
||||
xvilvl.d VX2, x4 ,x3
|
||||
xvilvh.d VX3, x4, x3
|
||||
xvst VX2, X, 0 * SIZE
|
||||
xvst VX3, X, 4 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
#else
|
||||
xvld VX1, X, 8 * SIZE
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvfmul.s x3, VXAI, x2
|
||||
xvfsub.s x3, VXZ, x3
|
||||
xvfmul.s x4, VXAI, x1
|
||||
xvilvl.w VX2, x4 ,x3
|
||||
xvilvh.w VX3, x4, x3
|
||||
xvst VX2, X, 0 * SIZE
|
||||
xvst VX3, X, 8 * SIZE
|
||||
addi.d X, X, 16 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L112
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L113: //alpha_r != 0.0 && alpha_i == 0.0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
|
@@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    b .L223 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
    bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0
    bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
b .L997
|
||||
.align 3
|
||||
|
||||
.L222: //alpha_r == 0.0 && alpha_i != 0.0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d x1, t1, 0
|
||||
xvinsgr2vr.d x2, t2, 0
|
||||
xvinsgr2vr.d x1, t3, 1
|
||||
xvinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
xvinsgr2vr.d x1, t1, 2
|
||||
xvinsgr2vr.d x2, t2, 2
|
||||
xvinsgr2vr.d x1, t3, 3
|
||||
xvinsgr2vr.d x2, t4, 3
|
||||
add.d X, X, INCX
|
||||
|
||||
xvfmul.d x3, VXAI, x2
|
||||
xvfsub.d x3, VXZ, x3
|
||||
xvfmul.d x4, VXAI, x1
|
||||
addi.d I, I, -1
|
||||
xvstelm.d x3, XX, 0 * SIZE, 0
|
||||
xvstelm.d x4, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.d x3, XX, 0 * SIZE, 1
|
||||
xvstelm.d x4, XX, 1 * SIZE, 1
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.d x3, XX, 0 * SIZE, 2
|
||||
xvstelm.d x4, XX, 1 * SIZE, 2
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.d x3, XX, 0 * SIZE, 3
|
||||
xvstelm.d x4, XX, 1 * SIZE, 3
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 0
|
||||
xvinsgr2vr.w x2, t2, 0
|
||||
xvinsgr2vr.w x1, t3, 1
|
||||
xvinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
xvinsgr2vr.w x1, t1, 2
|
||||
xvinsgr2vr.w x2, t2, 2
|
||||
xvinsgr2vr.w x1, t3, 3
|
||||
xvinsgr2vr.w x2, t4, 3
|
||||
add.d X, X, INCX
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 4
|
||||
xvinsgr2vr.w x2, t2, 4
|
||||
xvinsgr2vr.w x1, t3, 5
|
||||
xvinsgr2vr.w x2, t4, 5
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
xvinsgr2vr.w x1, t1, 6
|
||||
xvinsgr2vr.w x2, t2, 6
|
||||
xvinsgr2vr.w x1, t3, 7
|
||||
xvinsgr2vr.w x2, t4, 7
|
||||
add.d X, X, INCX
|
||||
|
||||
xvfmul.s x3, VXAI, x2
|
||||
xvfsub.s x3, VXZ, x3
|
||||
xvfmul.s x4, VXAI, x1
|
||||
addi.d I, I, -1
|
||||
xvstelm.w x3, XX, 0 * SIZE, 0
|
||||
xvstelm.w x4, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 1
|
||||
xvstelm.w x4, XX, 1 * SIZE, 1
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 2
|
||||
xvstelm.w x4, XX, 1 * SIZE, 2
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 3
|
||||
xvstelm.w x4, XX, 1 * SIZE, 3
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 4
|
||||
xvstelm.w x4, XX, 1 * SIZE, 4
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 5
|
||||
xvstelm.w x4, XX, 1 * SIZE, 5
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 6
|
||||
xvstelm.w x4, XX, 1 * SIZE, 6
|
||||
add.d XX, XX, INCX
|
||||
xvstelm.w x3, XX, 0 * SIZE, 7
|
||||
xvstelm.w x4, XX, 1 * SIZE, 7
|
||||
#endif
|
||||
add.d XX, XX, INCX
|
||||
blt $r0, I, .L222
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L223: //alpha_r != 0.0 && alpha_i == 0.0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
|
|
|
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    b .L113 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
    bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0
    bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
b .L997
|
||||
.align 3
|
||||
|
||||
.L112: //alpha_r == 0.0 && alpha_i != 0.0
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vfmul.d x3, VXAI, x2
|
||||
vfsub.d x3, VXZ, x3
|
||||
vfmul.d x4, VXAI, x1
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
vst VX2, X, 0 * SIZE
|
||||
vst VX3, X, 2 * SIZE
|
||||
vld VX0, X, 4 * SIZE
|
||||
vld VX1, X, 6 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vfmul.d x3, VXAI, x2
|
||||
vfsub.d x3, VXZ, x3
|
||||
vfmul.d x4, VXAI, x1
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
vst VX2, X, 4 * SIZE
|
||||
vst VX3, X, 6 * SIZE
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmul.s x3, VXAI, x2
|
||||
vfsub.s x3, VXZ, x3
|
||||
vfmul.s x4, VXAI, x1
|
||||
vilvl.w VX2, x4 ,x3
|
||||
vilvh.w VX3, x4, x3
|
||||
vst VX2, X, 0 * SIZE
|
||||
vst VX3, X, 4 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L112
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L113: //alpha_r != 0.0 && alpha_i == 0.0
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
|
@@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    b .L223 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
    bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0
    bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
b .L997
|
||||
.align 3
|
||||
|
||||
.L222: //alpha_r == 0.0 && alpha_i != 0.0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
vfmul.d x3, VXAI, x2
|
||||
vfsub.d x3, VXZ, x3
|
||||
vfmul.d x4, VXAI, x1
|
||||
vstelm.d x3, XX, 0 * SIZE, 0
|
||||
vstelm.d x4, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.d x3, XX, 0 * SIZE, 1
|
||||
vstelm.d x4, XX, 1 * SIZE, 1
|
||||
add.d XX, XX, INCX
|
||||
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
add.d X, X, INCX
|
||||
vfmul.d x3, VXAI, x2
|
||||
vfsub.d x3, VXZ, x3
|
||||
vfmul.d x4, VXAI, x1
|
||||
addi.d I, I, -1
|
||||
vstelm.d x3, XX, 0 * SIZE, 0
|
||||
vstelm.d x4, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.d x3, XX, 0 * SIZE, 1
|
||||
vstelm.d x4, XX, 1 * SIZE, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
add.d X, X, INCX
|
||||
|
||||
vfmul.s x3, VXAI, x2
|
||||
vfsub.s x3, VXZ, x3
|
||||
vfmul.s x4, VXAI, x1
|
||||
addi.d I, I, -1
|
||||
vstelm.w x3, XX, 0 * SIZE, 0
|
||||
vstelm.w x4, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w x3, XX, 0 * SIZE, 1
|
||||
vstelm.w x4, XX, 1 * SIZE, 1
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w x3, XX, 0 * SIZE, 2
|
||||
vstelm.w x4, XX, 1 * SIZE, 2
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w x3, XX, 0 * SIZE, 3
|
||||
vstelm.w x4, XX, 1 * SIZE, 3
|
||||
#endif
|
||||
add.d XX, XX, INCX
|
||||
blt $r0, I, .L222
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L223: //alpha_r != 0.0 && alpha_i == 0.0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
|
|
|
@@ -254,9 +254,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
    while(j < n1)
    {

        if (isnan(x[i]) || isinf(x[i]))
            temp0 = NAN;
        else
            temp0 = -da_i * x[i+1];
        x[i+1] = da_i * x[i];
        x[i] = temp0;
        if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]))
            temp1 = NAN;
        else
            temp1 = -da_i * x[i+1+inc_x];
        x[i+1+inc_x] = da_i * x[i+inc_x];
        x[i+inc_x] = temp1;
@@ -268,6 +274,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
    while(j < n)
    {

        if (isnan(x[i]) || isinf(x[i]))
            temp0 = NAN;
        else
            temp0 = -da_i * x[i+1];
        x[i+1] = da_i * x[i];
        x[i] = temp0;
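The isnan/isinf guards added above make a purely imaginary scale factor propagate NaN/Inf into the scaled element instead of silently producing a finite real part. A minimal check of that behavior through the CBLAS interface, mirroring the zscal NaN tests added later in this commit (illustrative; assumes linking against this branch of OpenBLAS):

    #include <math.h>
    #include <stdio.h>
    #include <cblas.h>

    int main(void) {
        double alpha[2] = { 0.0, 1.0 };   /* alpha = 0 + 1i, purely imaginary */
        double x[2]     = { NAN, 0.0 };   /* x[0] = NaN + 0i */

        cblas_zscal(1, alpha, x, 1);

        /* With the guard in place, the NaN survives the scaling. */
        printf("re = %g, im = %g\n", x[0], x[1]);
        return isnan(x[0]) ? 0 : 1;
    }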
@@ -16,6 +16,7 @@ else ()
    test_dnrm2.c
    test_swap.c
    test_zscal.c
    test_amin.c
    )
endif ()

@@ -11,7 +11,8 @@ UTESTBIN=openblas_utest

include $(TOPDIR)/Makefile.system

OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o
OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \
    test_amin.o
#test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o

ifneq ($(NO_LAPACK), 1)
@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2016, The OpenBLAS Project
Copyright (c) 2011-2024, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -57,4 +57,31 @@ CTEST(amax, damax){
    ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
}
#endif
#ifdef BUILD_COMPLEX
CTEST(amax, scamax){
    blasint N = 9, inc = 1;
    float te_max = 0.0, tr_max = 0.0;
    float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                  -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
                  -7.7, 8.8 };

    te_max = BLASFUNC(scamax)(&N, x, &inc);
    tr_max = 20.0;

    ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
}
#endif
#ifdef BUILD_COMPLEX16
CTEST(amax, dzamax){
    blasint N = 9, inc = 1;
    double te_max = 0.0, tr_max = 0.0;
    double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                   -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
                   -7.7, 8.8 };

    te_max = BLASFUNC(dzamax)(&N, x, &inc);
    tr_max = 20.0;

    ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
}
#endif
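For reference, the expected value tr_max = 20.0 in the complex tests above follows from the |Re| + |Im| measure: the nine (re, im) pairs give 3.3, 7.7, 12.1, 16.5, 20.0, 3.3, 7.7, 12.1, 16.5, and the maximum is |-9.9| + |10.10| = 20.0. A standalone sketch recomputing it (uses only the C standard library):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        float x[] = { -1.1f, 2.2f, -3.3f, 4.4f, -5.5f, 6.6f, -7.7f, 8.8f,
                      -9.9f, 10.10f, -1.1f, 2.2f, -3.3f, 4.4f, -5.5f, 6.6f,
                      -7.7f, 8.8f };
        float max = 0.0f;
        for (int k = 0; k < 9; k++) {
            float v = fabsf(x[2 * k]) + fabsf(x[2 * k + 1]);
            if (v > max) max = v;
        }
        printf("expected tr_max = %.2f\n", max);   /* prints 20.00 */
        return 0;
    }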
@@ -0,0 +1,89 @@
/*****************************************************************************
Copyright (c) 2011-2024, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

#include "openblas_utest.h"

#ifdef BUILD_SINGLE
CTEST(amin, samin){
    blasint N = 3, inc = 1;
    float te_min = 0.0, tr_min = 0.0;
    float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                  -9.9 };

    te_min = BLASFUNC(samin)(&N, x, &inc);
    tr_min = 1.1;

    ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
}
#endif
#ifdef BUILD_DOUBLE
CTEST(amin, damin){
    blasint N = 3, inc = 1;
    double te_min = 0.0, tr_min = 0.0;
    double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                   -9.9 };

    te_min = BLASFUNC(damin)(&N, x, &inc);
    tr_min = 1.1;

    ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
}
#endif
#ifdef BUILD_COMPLEX
CTEST(amin, scamin){
    blasint N = 9, inc = 1;
    float te_min = 0.0, tr_min = 0.0;
    float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                  -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
                  -7.7, 8.8 };

    te_min = BLASFUNC(scamin)(&N, x, &inc);
    tr_min = 3.3;

    ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
}
#endif
#ifdef BUILD_COMPLEX16
CTEST(amin, dzamin){
    blasint N = 9, inc = 1;
    double te_min = 0.0, tr_min = 0.0;
    double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
                   -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
                   -7.7, 8.8 };

    te_min = BLASFUNC(dzamin)(&N, x, &inc);
    tr_min = 3.3;

    ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
}
#endif
@@ -20,6 +20,18 @@ CTEST(zscal, i_nan)
    ASSERT_TRUE(isnan(nan[17]));
}

CTEST(zscal, i_nan_inc_2)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0,
                    NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
    cblas_zscal(9, i, &nan, 2);
    ASSERT_TRUE(isnan(nan[0]));
    ASSERT_TRUE(isnan(nan[1]));
    ASSERT_TRUE(isnan(nan[16]));
    ASSERT_TRUE(isnan(nan[17]));
}

CTEST(zscal, nan_i)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };

@@ -31,6 +43,18 @@ CTEST(zscal, nan_i)
    ASSERT_TRUE(isnan(i[17]));
}

CTEST(zscal, nan_i_inc_2)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
                  0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
    cblas_zscal(9, &nan, &i, 2);
    ASSERT_TRUE(isnan(i[0]));
    ASSERT_TRUE(isnan(i[1]));
    ASSERT_TRUE(isnan(i[16]));
    ASSERT_TRUE(isnan(i[17]));
}

CTEST(zscal, i_inf)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };

@@ -42,6 +66,18 @@ CTEST(zscal, i_inf)
    ASSERT_TRUE(isinf(inf[17]));
}

CTEST(zscal, i_inf_inc_2)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0,
                    INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
    cblas_zscal(9, i, &inf, 2);
    ASSERT_TRUE(isnan(inf[0]));
    ASSERT_TRUE(isinf(inf[1]));
    ASSERT_TRUE(isnan(inf[16]));
    ASSERT_TRUE(isinf(inf[17]));
}

CTEST(zscal, inf_i)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };

@@ -53,4 +89,16 @@ CTEST(zscal, inf_i)
    ASSERT_TRUE(isinf(i[17]));
}

CTEST(zscal, inf_i_inc_2)
{
    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
                  0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
    cblas_zscal(9, &inf, &i, 2);
    ASSERT_TRUE(isnan(i[0]));
    ASSERT_TRUE(isinf(i[1]));
    ASSERT_TRUE(isnan(i[16]));
    ASSERT_TRUE(isinf(i[17]));
}

#endif