Merge pull request #2072 from martin-frbg/sum
Add (C)BLAS extension ?sum
This commit is contained in:
commit
ccfb7ead15
5
cblas.h
5
cblas.h
|
@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
|
|||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
||||
|
|
|
@ -107,6 +107,12 @@ macro(SetDefaultL1)
|
|||
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(SSUMKERNEL sum.S)
|
||||
set(DSUMKERNEL sum.S)
|
||||
set(CSUMKERNEL zsum.S)
|
||||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define CDOTC_K cdotc_k
|
||||
#define CNRM2_K cnrm2_k
|
||||
#define CSCAL_K cscal_k
|
||||
#define CSUM_K csum_k
|
||||
#define CSWAP_K cswap_k
|
||||
#define CROT_K csrot_k
|
||||
|
||||
|
@ -249,6 +250,7 @@
|
|||
#define CDOTC_K gotoblas -> cdotc_k
|
||||
#define CNRM2_K gotoblas -> cnrm2_k
|
||||
#define CSCAL_K gotoblas -> cscal_k
|
||||
#define CSUM_K gotoblas -> csum_k
|
||||
#define CSWAP_K gotoblas -> cswap_k
|
||||
#define CROT_K gotoblas -> csrot_k
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define DDOTC_K ddot_k
|
||||
#define DNRM2_K dnrm2_k
|
||||
#define DSCAL_K dscal_k
|
||||
#define DSUM_K dsum_k
|
||||
#define DSWAP_K dswap_k
|
||||
#define DROT_K drot_k
|
||||
|
||||
|
@ -174,6 +175,7 @@
|
|||
#define DDOTC_K gotoblas -> ddot_k
|
||||
#define DNRM2_K gotoblas -> dnrm2_k
|
||||
#define DSCAL_K gotoblas -> dscal_k
|
||||
#define DSUM_K gotoblas -> dsum_k
|
||||
#define DSWAP_K gotoblas -> dswap_k
|
||||
#define DROT_K gotoblas -> drot_k
|
||||
|
||||
|
|
|
@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
|
|||
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
|
||||
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
|
||||
double BLASFUNC(dsum) (blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
|
||||
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
|
||||
|
||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
||||
|
|
|
@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
|
|||
double zasum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float ssum_k (BLASLONG, float *, BLASLONG);
|
||||
double dsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
float csum_k (BLASLONG, float *, BLASLONG);
|
||||
double zsum_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
||||
float samax_k (BLASLONG, float *, BLASLONG);
|
||||
double damax_k (BLASLONG, double *, BLASLONG);
|
||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
||||
|
|
|
@ -66,6 +66,7 @@
|
|||
#define DOTC_K QDOTC_K
|
||||
#define NRM2_K QNRM2_K
|
||||
#define SCAL_K QSCAL_K
|
||||
#define SUM_K QSUM_K
|
||||
#define SWAP_K QSWAP_K
|
||||
#define ROT_K QROT_K
|
||||
|
||||
|
@ -356,6 +357,7 @@
|
|||
#define DOTC_K DDOTC_K
|
||||
#define NRM2_K DNRM2_K
|
||||
#define SCAL_K DSCAL_K
|
||||
#define SUM_K DSUM_K
|
||||
#define SWAP_K DSWAP_K
|
||||
#define ROT_K DROT_K
|
||||
|
||||
|
@ -658,6 +660,7 @@
|
|||
#define DOTC_K SDOTC_K
|
||||
#define NRM2_K SNRM2_K
|
||||
#define SCAL_K SSCAL_K
|
||||
#define SUM_K SSUM_K
|
||||
#define SWAP_K SSWAP_K
|
||||
#define ROT_K SROT_K
|
||||
|
||||
|
@ -962,6 +965,7 @@
|
|||
#define DOTC_K XDOTC_K
|
||||
#define NRM2_K XNRM2_K
|
||||
#define SCAL_K XSCAL_K
|
||||
#define SUM_K XSUM_K
|
||||
#define SWAP_K XSWAP_K
|
||||
#define ROT_K XROT_K
|
||||
|
||||
|
@ -1363,6 +1367,7 @@
|
|||
#define DOTC_K ZDOTC_K
|
||||
#define NRM2_K ZNRM2_K
|
||||
#define SCAL_K ZSCAL_K
|
||||
#define SUM_K ZSUM_K
|
||||
#define SWAP_K ZSWAP_K
|
||||
#define ROT_K ZROT_K
|
||||
|
||||
|
@ -1785,6 +1790,7 @@
|
|||
#define DOTC_K CDOTC_K
|
||||
#define NRM2_K CNRM2_K
|
||||
#define SCAL_K CSCAL_K
|
||||
#define SUM_K CSUM_K
|
||||
#define SWAP_K CSWAP_K
|
||||
#define ROT_K CROT_K
|
||||
|
||||
|
|
|
@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
|
||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
|
||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||
|
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
||||
|
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
|
||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
|||
|
||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*zsum_k) (BLASLONG, double *, BLASLONG);
|
||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define QDOTC_K qdot_k
|
||||
#define QNRM2_K qnrm2_k
|
||||
#define QSCAL_K qscal_k
|
||||
#define QSUM_K qsum_k
|
||||
#define QSWAP_K qswap_k
|
||||
#define QROT_K qrot_k
|
||||
|
||||
|
@ -161,6 +162,7 @@
|
|||
#define QDOTC_K gotoblas -> qdot_k
|
||||
#define QNRM2_K gotoblas -> qnrm2_k
|
||||
#define QSCAL_K gotoblas -> qscal_k
|
||||
#define QSUM_K gotoblas -> qsum_k
|
||||
#define QSWAP_K gotoblas -> qswap_k
|
||||
#define QROT_K gotoblas -> qrot_k
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#define ISMAX_K ismax_k
|
||||
#define ISMIN_K ismin_k
|
||||
#define SASUM_K sasum_k
|
||||
#define SSUM_K ssum_k
|
||||
#define SAXPYU_K saxpy_k
|
||||
#define SAXPYC_K saxpy_k
|
||||
#define SCOPY_K scopy_k
|
||||
|
@ -170,6 +171,7 @@
|
|||
#define ISMAX_K gotoblas -> ismax_k
|
||||
#define ISMIN_K gotoblas -> ismin_k
|
||||
#define SASUM_K gotoblas -> sasum_k
|
||||
#define SSUM_K gotoblas -> ssum_k
|
||||
#define SAXPYU_K gotoblas -> saxpy_k
|
||||
#define SAXPYC_K gotoblas -> saxpy_k
|
||||
#define SCOPY_K gotoblas -> scopy_k
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define XDOTC_K xdotc_k
|
||||
#define XNRM2_K xnrm2_k
|
||||
#define XSCAL_K xscal_k
|
||||
#define XSUM_K xsum_k
|
||||
#define XSWAP_K xswap_k
|
||||
#define XROT_K xqrot_k
|
||||
|
||||
|
@ -227,6 +228,7 @@
|
|||
#define XDOTC_K gotoblas -> xdotc_k
|
||||
#define XNRM2_K gotoblas -> xnrm2_k
|
||||
#define XSCAL_K gotoblas -> xscal_k
|
||||
#define XSUM_K gotoblas -> xsum_k
|
||||
#define XSWAP_K gotoblas -> xswap_k
|
||||
#define XROT_K gotoblas -> xqrot_k
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define ZDOTC_K zdotc_k
|
||||
#define ZNRM2_K znrm2_k
|
||||
#define ZSCAL_K zscal_k
|
||||
#define ZSUM_K zsum_k
|
||||
#define ZSWAP_K zswap_k
|
||||
#define ZROT_K zdrot_k
|
||||
|
||||
|
@ -249,6 +250,7 @@
|
|||
#define ZDOTC_K gotoblas -> zdotc_k
|
||||
#define ZNRM2_K gotoblas -> znrm2_k
|
||||
#define ZSCAL_K gotoblas -> zscal_k
|
||||
#define ZSUM_K gotoblas -> zsum_k
|
||||
#define ZSWAP_K gotoblas -> zswap_k
|
||||
#define ZROT_K gotoblas -> zdrot_k
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
|
|||
rotm.c rotmg.c # N.B. these do not have complex counterparts
|
||||
rot.c
|
||||
asum.c
|
||||
sum.c
|
||||
)
|
||||
|
||||
# these will have 'z' prepended for the complex version
|
||||
|
@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
|
||||
|
@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
|
|
|
@ -25,7 +25,7 @@ SBLAS1OBJS = \
|
|||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
|
||||
scopy.$(SUFFIX) sscal.$(SUFFIX) \
|
||||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
|
||||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
|
||||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
|
||||
|
@ -51,7 +51,7 @@ DBLAS1OBJS = \
|
|||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
|
||||
ddot.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
|
||||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
|
||||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
|
||||
|
@ -76,7 +76,7 @@ CBLAS1OBJS = \
|
|||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
|
||||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||
scamax.$(SUFFIX) icamax.$(SUFFIX) \
|
||||
scamin.$(SUFFIX) icamin.$(SUFFIX) \
|
||||
csrot.$(SUFFIX) crotg.$(SUFFIX) \
|
||||
|
@ -105,7 +105,7 @@ ZBLAS1OBJS = \
|
|||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
|
||||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
|
||||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
|
||||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
|
||||
|
@ -146,7 +146,7 @@ QBLAS1OBJS = \
|
|||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qdot.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
|
@ -168,7 +168,7 @@ XBLAS1OBJS = \
|
|||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
|
@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
|
|||
QBLAS1OBJS = \
|
||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||
|
@ -224,7 +224,7 @@ QBLAS3OBJS = \
|
|||
XBLAS1OBJS = \
|
||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||
|
@ -264,7 +264,7 @@ CSBLAS1OBJS = \
|
|||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX)
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||
|
||||
CSBLAS2OBJS = \
|
||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||
|
@ -282,7 +282,7 @@ CDBLAS1OBJS = \
|
|||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX)
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||
|
||||
CDBLAS2OBJS = \
|
||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||
|
@ -303,7 +303,7 @@ CCBLAS1OBJS = \
|
|||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX)
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
|
@ -330,7 +330,7 @@ CZBLAS1OBJS = \
|
|||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX)
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
|
@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
|
|||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
|
@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
|
|||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
/* Fortran-style (pass-by-reference) interface for the ?sum extension. */
/* Reads the element count from *N and the stride from *INCX, then hands */
/* the vector x to the precision-selected SUM_K kernel and returns its */
/* result converted to FLOATRET. */
/* NOTE(review): presumably the kernel returns the plain (non-absolute) */
/* sum of the elements, as the counterpart of ?asum - confirm in kernel. */
/* Returns 0 without calling the kernel when *N <= 0. INCX is forwarded */
/* unchecked; no validation of the stride is done at this level. */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||
|
||||
BLASLONG n = *N; /* widen the by-reference arguments to BLASLONG for the kernel call */
|
||||
BLASLONG incx = *INCX;
|
||||
FLOATRET ret;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
if (n <= 0) return 0; /* early exit: skips the IDEBUG/profile macros below */
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
ret = (FLOATRET)SUM_K(n, x, incx); /* SUM_K is a macro resolved per precision outside this block */
|
||||
|
||||
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else
|
||||
/* CBLAS-style (pass-by-value) interface for the ?sum extension. */
/* When COMPLEX is defined the vector arrives as an untyped pointer and */
/* is cast to FLOAT*; otherwise it is taken as FLOAT* directly. Either */
/* way the function forwards n, x and incx to the SUM_K kernel and */
/* returns its result as FLOAT. Returns 0 without calling the kernel */
/* when n <= 0. incx is forwarded unchecked. */
#ifdef COMPLEX
|
||||
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||
FLOAT *x = (FLOAT*) vx; /* complex input is passed as void*; the kernel takes FLOAT* */
|
||||
#else
|
||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||
#endif
|
||||
|
||||
FLOAT ret;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
if (n <= 0) return 0; /* early exit: skips the IDEBUG/profile macros below */
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
ret = SUM_K(n, x, incx); /* SUM_K is a macro resolved per precision outside this block */
|
||||
|
||||
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
|
||||
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
|
||||
|
|
|
@ -340,6 +340,32 @@ ifndef XSCALKERNEL
|
|||
XSCALKERNEL = zscal.S
|
||||
endif
|
||||
|
||||
### SUM ###
|
||||
|
||||
ifndef SSUMKERNEL
|
||||
SSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef DSUMKERNEL
|
||||
DSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef CSUMKERNEL
|
||||
CSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
ifndef ZSUMKERNEL
|
||||
ZSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
ifndef QSUMKERNEL
|
||||
QSUMKERNEL = sum.S
|
||||
endif
|
||||
|
||||
ifndef XSUMKERNEL
|
||||
XSUMKERNEL = zsum.S
|
||||
endif
|
||||
|
||||
### SWAP ###
|
||||
|
||||
ifndef SSWAPKERNEL
|
||||
|
@ -453,7 +479,7 @@ endif
|
|||
SBLASOBJS += \
|
||||
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
|
||||
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
|
||||
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
|
||||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
saxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
|
@ -463,31 +489,32 @@ DBLASOBJS += \
|
|||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
|
||||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
daxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
QBLASOBJS += \
|
||||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
|
||||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
|
||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
|
||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
|
||||
qsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CBLASOBJS += \
|
||||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZBLASOBJS += \
|
||||
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
|
||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
XBLASOBJS += \
|
||||
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
|
||||
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
### AMAX ###
|
||||
|
||||
|
@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
|
||||
|
||||
|
||||
### ASUM ###
|
||||
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
|
@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
### SUM ###
|
||||
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
### AXPY ###
|
||||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
|
|
|
@ -0,0 +1,206 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Look-ahead distance for the prefetch-style load in the main loop (used there as PREFETCHSIZE * 2 * SIZE). */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Integer registers: element count, vector pointer, stride, loop counter. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
/* s0-s3: four independent partial sums; the total is left in s0 before returning. */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0-a7: destination registers for the values loaded from X. */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
/* t0-t3: staging registers; a loaded value is copied here and added to a partial sum one step later. */
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
/*
 * Plain sum of N elements of X taken INCX apart (no absolute value is applied:
 * values are copied with fmov before being accumulated).
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE come from common.h;
 * presumably LD/ADD are the precision-dependent load/add and SXADDQ advances
 * X by INCX scaled to the element size -- confirm in common.h.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* Clear the result and the pending value so the early exit at $L999 adds 0 + 0. */
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
/* N <= 0: nothing to sum. */
ble N, $L999
|
||||
|
||||
/* I = N >> 3: number of 8-element groups; fewer than 8 elements go straight to the tail. */
sra N, 3, I
|
||||
fclr s1
|
||||
fclr s2
|
||||
ble I, $L15
|
||||
|
||||
/* Preload the first six elements (a0-a5) while clearing the remaining accumulators. */
LD a0, 0 * SIZE(X)
|
||||
fclr t1
|
||||
SXADDQ INCX, X, X
|
||||
fclr t2
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
fclr t3
|
||||
SXADDQ INCX, X, X
|
||||
fclr s3
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a3, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
LD a5, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
/* Only one group: skip the steady-state loop and drain at $L13. */
lda I, -1(I)
|
||||
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Steady-state loop, one 8-element group per iteration. Each step adds the
 * previously staged value (tN) into a partial sum, stages the next loaded
 * value (fmov aN, tN) and issues the load for a later element.
 */
$L12:
|
||||
ADD s0, t0, s0
|
||||
/* Load into $31 with the result unused -- presumably a prefetch hint. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 0 * SIZE(X)
|
||||
fmov a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a3, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 0 * SIZE(X)
|
||||
fmov a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fmov a5, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 0 * SIZE(X)
|
||||
fmov a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fmov a7, t3
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
/*
 * Drain: load the last two elements of the final group (a6, a7) and flush
 * the staged values. On exit t0 still holds a4, which is added at $L17/$L999.
 */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fmov a2, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fmov a4, t0
|
||||
ADD s1, t1, s1
|
||||
fmov a5, t1
|
||||
ADD s2, t2, s2
|
||||
fmov a6, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a7, t3
|
||||
|
||||
ADD s1, t1, s1
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
/* Fold the four partial sums pairwise; s0 + s2 is completed at $L15. */
ADD s0, s1, s0
|
||||
ADD s2, s3, s2
|
||||
.align 4
|
||||
|
||||
/* Tail: I = N & 7 leftover elements. When entered directly (N < 8) s0 and s2 are both zero. */
$L15:
|
||||
and N, 7, I
|
||||
ADD s0, s2, s0
|
||||
unop
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
/* One element per iteration: add the pending t0, then stage the newly loaded value in t0. */
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
fmov a0, t0
|
||||
|
||||
lda I, -1(I)
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
/* Add the last pending value (zero on the early-exit path) and return with the total in s0. */
$L999:
|
||||
ADD s0, t0, s0
|
||||
ret
|
||||
EPILOGUE
|
|
@ -0,0 +1,208 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "version.h"
|
||||
|
||||
/* Look-ahead distance for the prefetch-style load in the main loop (used there as PREFETCHSIZE * SIZE). */
#define PREFETCHSIZE 88
|
||||
|
||||
/* Integer registers: element count, vector pointer, stride, loop counter. */
#define N $16
|
||||
#define X $17
|
||||
#define INCX $18
|
||||
#define I $19
|
||||
|
||||
/* s0-s3: partial sums. s0/s2 collect the values at offset 0 of each element, s1/s3 those at offset 1. */
#define s0 $f0
|
||||
#define s1 $f1
|
||||
#define s2 $f10
|
||||
#define s3 $f11
|
||||
|
||||
/* a0-a7: destination registers for the values loaded from X. */
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define a2 $f14
|
||||
#define a3 $f15
|
||||
#define a4 $f16
|
||||
#define a5 $f17
|
||||
#define a6 $f18
|
||||
#define a7 $f19
|
||||
|
||||
/* t0-t3: staging registers; a loaded value is copied here and added to a partial sum one step later. */
#define t0 $f20
|
||||
#define t1 $f21
|
||||
#define t2 $f22
|
||||
#define t3 $f23
|
||||
|
||||
/*
 * Plain sum over N two-component (complex) elements of X: both components of
 * every element are added, with no absolute value applied.
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE come from common.h;
 * presumably LD/ADD are the precision-dependent load/add and SXADDQ advances
 * X by INCX scaled to the component size -- confirm in common.h.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
fclr s0
|
||||
unop
|
||||
fclr t0
|
||||
/* Double the stride: each element consists of two consecutive values. */
addq INCX, INCX, INCX
|
||||
|
||||
fclr s1
|
||||
unop
|
||||
fclr t1
|
||||
/* N <= 0: nothing to sum (s0, s1, t0, t1 are already zero for the exit path). */
ble N, $L999
|
||||
|
||||
/* I = N >> 2: number of 4-element groups; fewer than 4 elements go straight to the tail. */
fclr s2
|
||||
sra N, 2, I
|
||||
fclr s3
|
||||
ble I, $L15
|
||||
|
||||
/* Preload the first three elements (a0/a1, a2/a3, a4/a5). */
LD a0, 0 * SIZE(X)
|
||||
fclr t2
|
||||
LD a1, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
fclr t3
|
||||
LD a3, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
LD a5, 1 * SIZE(X)
|
||||
SXADDQ INCX, X, X
|
||||
lda I, -1(I)
|
||||
|
||||
/* Only one group: skip the steady-state loop and drain at $L13. */
ble I, $L13
|
||||
.align 4
|
||||
|
||||
/*
 * Steady-state loop, four elements (eight values) per iteration. Each step
 * adds the previously staged value (tN) into a partial sum, stages the next
 * loaded value (fmov aN, tN) and issues the load for a later value.
 */
$L12:
|
||||
ADD s0, t0, s0
|
||||
/* Load into $31 with the result unused -- presumably a prefetch hint. */
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a1, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a7, 1 * SIZE(X)
|
||||
fmov a2, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a3, t3
|
||||
unop
|
||||
|
||||
ADD s0, t0, s0
|
||||
LD a1, 1 * SIZE(X)
|
||||
fmov a4, t0
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a2, 0 * SIZE(X)
|
||||
fmov a5, t1
|
||||
unop
|
||||
|
||||
ADD s2, t2, s2
|
||||
LD a3, 1 * SIZE(X)
|
||||
fmov a6, t2
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s3, t3, s3
|
||||
LD a4, 0 * SIZE(X)
|
||||
fmov a7, t3
|
||||
unop
|
||||
|
||||
LD a5, 1 * SIZE(X)
|
||||
unop
|
||||
SXADDQ INCX, X, X
|
||||
bne I, $L12
|
||||
.align 4
|
||||
|
||||
/*
 * Drain: load the last element of the final group (a6/a7) and flush the
 * staged values. On exit t0 and t1 still hold a4 and a5, which are added at
 * $L17/$L999.
 */
$L13:
|
||||
ADD s0, t0, s0
|
||||
LD a6, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a7, 1 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
ADD s2, t2, s2
|
||||
fmov a2, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a3, t3
|
||||
|
||||
ADD s0, t0, s0
|
||||
fmov a4, t0
|
||||
ADD s1, t1, s1
|
||||
fmov a5, t1
|
||||
ADD s2, t2, s2
|
||||
fmov a6, t2
|
||||
ADD s3, t3, s3
|
||||
fmov a7, t3
|
||||
|
||||
ADD s2, t2, s2
|
||||
ADD s3, t3, s3
|
||||
|
||||
.align 4
|
||||
|
||||
/* Tail: fold s2/s3 into s0/s1, then handle the I = N & 3 leftover elements. */
$L15:
|
||||
ADD s0, s2, s0
|
||||
and N, 3, I
|
||||
ADD s1, s3, s1
|
||||
ble I, $L999
|
||||
.align 4
|
||||
|
||||
/* One element per iteration: add the pending t0/t1, then stage the two newly loaded values. */
$L17:
|
||||
ADD s0, t0, s0
|
||||
LD a0, 0 * SIZE(X)
|
||||
fmov a0, t0
|
||||
lda I, -1(I)
|
||||
|
||||
ADD s1, t1, s1
|
||||
LD a1, 1 * SIZE(X)
|
||||
fmov a1, t1
|
||||
SXADDQ INCX, X, X
|
||||
|
||||
bne I, $L17
|
||||
.align 4
|
||||
|
||||
/* Add the last pending pair (zero on the early-exit path) and combine both component sums into s0. */
$L999:
|
||||
ADD s0, t0, s0
|
||||
ADD s1, t1, s1
|
||||
|
||||
ADD s0, s1, s0
|
||||
ret
|
||||
EPILOGUE
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S
|
|||
CASUMKERNEL = asum_vfp.S
|
||||
ZASUMKERNEL = asum_vfp.S
|
||||
|
||||
SSUMKERNEL = sum_vfp.S
|
||||
DSUMKERNEL = sum_vfp.S
|
||||
|
||||
SAXPYKERNEL = axpy_vfp.S
|
||||
DAXPYKERNEL = axpy_vfp.S
|
||||
CAXPYKERNEL = axpy_vfp.S
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of asum.c with the ABS() removed *
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/*
 * Generic ?sum kernel: returns x[0] + x[inc_x] + ... over n elements,
 * accumulated front to back (no absolute value is taken).
 * Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0)
		return total;

	/* One past the index of the last element that is visited. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
		total += x[pos];

	return total;
}
|
||||
|
||||
|
|
@ -0,0 +1,425 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
|
||||
**************************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* NOTE(review): STACKSIZE is not referenced by any code visible in this file. */
#define STACKSIZE 256
|
||||
|
||||
/* Argument registers: element count, vector pointer, stride (in elements on entry; the strided path scales it to bytes). */
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
|
||||
|
||||
|
||||
/* Loop counter. */
#define I r12
|
||||
|
||||
|
||||
/* Offset ahead of X used by the pld prefetch instructions. */
#define X_PRE 512
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
/* Real double, unit stride: load four consecutive doubles and add them alternately into d0 and d1. */
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
.endm
|
||||
|
||||
/* Real double, unit stride: load one double (post-incrementing X) and add it to d0. */
.macro KERNEL_F1
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Real double, strided: four times load one double at X, add it to d0, then advance X by INC_X. */
.macro KERNEL_S4
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Real double, strided: single-element version of KERNEL_S4. */
.macro KERNEL_S1
|
||||
|
||||
vldmia.f64 X, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Real single, unit stride: load four consecutive floats and add them
 * alternately into s0 and s1. Unlike the other variants this macro has no
 * pld of its own; the main loop issues it for this configuration.
 */
.macro KERNEL_F4
|
||||
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
.endm
|
||||
|
||||
/* Real single, unit stride: load one float (post-incrementing X) and add it to s0. */
.macro KERNEL_F1
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Real single, strided: four times load one float at X, add it to s0, then advance X by INC_X. */
.macro KERNEL_S4
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Real single, strided: single-element version of KERNEL_S4. */
.macro KERNEL_S1
|
||||
|
||||
vldmia.f32 X, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
/*
 * Complex double, unit stride: two rounds of four doubles each, i.e. eight
 * consecutive doubles (four complex elements), added alternately into d0 and d1.
 */
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f64 X!, { d4 - d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vldmia.f64 X!, { d6 - d7 }
|
||||
vadd.f64 d1 , d1, d5
|
||||
vadd.f64 d0 , d0, d6
|
||||
vadd.f64 d1 , d1, d7
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
/* Complex double, unit stride: load two consecutive doubles (one complex element) and add both to d0. */
.macro KERNEL_F1
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
vldmia.f64 X!, { d4 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Complex double, strided: four times load the pair d4/d5 at X, add both to d0, then advance X by INC_X. */
.macro KERNEL_S4
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Complex double, strided: single-element version of KERNEL_S4. */
.macro KERNEL_S1
|
||||
|
||||
vldmia.f64 X, { d4 -d5 }
|
||||
vadd.f64 d0 , d0, d4
|
||||
vadd.f64 d0 , d0, d5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Complex single, unit stride: eight consecutive floats (four complex
 * elements) added alternately into s0 and s1, with one prefetch up front.
 */
.macro KERNEL_F4
|
||||
|
||||
pld [ X, #X_PRE ]
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
vldmia.f32 X!, { s4 - s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vldmia.f32 X!, { s6 - s7 }
|
||||
vadd.f32 s1 , s1, s5
|
||||
vadd.f32 s0 , s0, s6
|
||||
vadd.f32 s1 , s1, s7
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
/* Complex single, unit stride: load two consecutive floats (one complex element) and add both to s0. */
.macro KERNEL_F1
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
vldmia.f32 X!, { s4 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Complex single, strided: four times load the pair s4/s5 at X, add both to s0, then advance X by INC_X. */
.macro KERNEL_S4
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/* Complex single, strided: single-element version of KERNEL_S4. */
.macro KERNEL_S1
|
||||
|
||||
vldmia.f32 X, { s4 -s5 }
|
||||
vadd.f32 s0 , s0, s4
|
||||
vadd.f32 s0 , s0, s5
|
||||
add X, X, INC_X
|
||||
|
||||
.endm
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/*
 * ?sum kernel body shared by the real/complex, single/double builds (selected
 * with COMPLEX and DOUBLE). Sums N elements of X with stride INC_X using the
 * KERNEL_* macros; partial sums live in s0/s1 (d0/d1 for DOUBLE).
 * The labels keep the asum_kernel_ prefix of the file this was copied from.
 */
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
movs r12, #0 // clear floating point register
|
||||
vmov s0, r12
|
||||
vmov s1, r12
|
||||
/* For DOUBLE, produce the double-precision zeros in d0/d1 by converting the single-precision zeros. */
#if defined(DOUBLE)
|
||||
vcvt.f64.f32 d0, s0
|
||||
vcvt.f64.f32 d1, s1
|
||||
#endif
|
||||
|
||||
/* N <= 0: return the zero sum. */
cmp N, #0
|
||||
ble asum_kernel_L999
|
||||
|
||||
/*
 * NOTE(review): only INC_X == 0 is rejected here; a negative INC_X takes the
 * strided path below, whereas the generic C kernel (arm/sum.c) returns 0 for
 * inc_x <= 0. Confirm callers never pass a negative stride.
 */
cmp INC_X, #0
|
||||
beq asum_kernel_L999
|
||||
|
||||
/* Unit stride uses the contiguous (F) path, anything else the strided (S) path. */
cmp INC_X, #1
|
||||
bne asum_kernel_S_BEGIN
|
||||
|
||||
|
||||
asum_kernel_F_BEGIN:
|
||||
|
||||
asrs I, N, #2 // I = N / 4
|
||||
ble asum_kernel_F1
|
||||
|
||||
.align 5
|
||||
|
||||
/* Contiguous main loop, unrolled twice: two KERNEL_F4 invocations per pass, each followed by its own counter check. */
asum_kernel_F4:
|
||||
|
||||
/* The real single-precision KERNEL_F4 contains no pld, so the prefetch is issued here. */
#if !defined(DOUBLE) && !defined(COMPLEX)
|
||||
pld [ X, #X_PRE ]
|
||||
#endif
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
ble asum_kernel_F1
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_F4
|
||||
|
||||
/* Contiguous tail: the remaining N & 3 elements, one per KERNEL_F1. */
asum_kernel_F1:
|
||||
|
||||
ands I, N, #3
|
||||
ble asum_kernel_L999
|
||||
|
||||
asum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_F10
|
||||
|
||||
b asum_kernel_L999
|
||||
|
||||
/* Strided path: first convert INC_X from elements to bytes for the selected element size. */
asum_kernel_S_BEGIN:
|
||||
|
||||
#if defined(COMPLEX)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3 // INC_X * SIZE
|
||||
#else
|
||||
lsl INC_X, INC_X, #2 // INC_X * SIZE
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
asrs I, N, #2 // I = N / 4
|
||||
ble asum_kernel_S1
|
||||
|
||||
.align 5
|
||||
|
||||
/* Strided main loop: four elements per KERNEL_S4. */
asum_kernel_S4:
|
||||
|
||||
KERNEL_S4
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_S4
|
||||
|
||||
/* Strided tail: the remaining N & 3 elements, one per KERNEL_S1. */
asum_kernel_S1:
|
||||
|
||||
ands I, N, #3
|
||||
ble asum_kernel_L999
|
||||
|
||||
asum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_S10
|
||||
|
||||
|
||||
/* Common exit: combine the two partial sums into s0/d0. */
asum_kernel_L999:
|
||||
|
||||
|
||||
#if defined(DOUBLE)
|
||||
vadd.f64 d0 , d0, d1 // set return value
|
||||
#else
|
||||
vadd.f32 s0 , s0, s1 // set return value
|
||||
#endif
|
||||
|
||||
/* When __ARM_PCS_VFP is not defined, also copy the result into r0 (r0/r1 for DOUBLE). */
#if !defined(__ARM_PCS_VFP)
|
||||
#if !defined(DOUBLE)
|
||||
vmov r0, s0
|
||||
#else
|
||||
vmov r0, r1, d0
|
||||
#endif
|
||||
#endif
|
||||
|
||||
bx lr
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* trivial copy of zasum.c with the ABS() removed *
|
||||
**************************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CSUM1(x,i) x[i]+x[i+1]
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += CSUM1(x,i);
|
||||
i += inc_x2;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Zero source register, scalar accumulator and scalar temporary for single precision. */
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
/* NOTE(review): TMPVF and SZ are not used by the macros or code visible in this file. */
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * Unit stride, one complex element: load two floats into v1, rotate the pair
 * into v2 so that s2 holds the second float, then add first + second to SUMF.
 */
.macro KERNEL_F1
|
||||
ld1 {v1.2s}, [X], #8
|
||||
ext v2.8b, v1.8b, v1.8b, #4
|
||||
fadd TMPF, TMPF, s2
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/* Unit stride, 16 floats (eight complex elements): load four vectors and accumulate them lane-wise into v0. */
.macro KERNEL_F8
|
||||
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
|
||||
add X, X, #64
|
||||
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
|
||||
fadd v1.4s, v1.4s, v2.4s
|
||||
fadd v3.4s, v3.4s, v4.4s
|
||||
fadd v0.4s, v0.4s, v1.4s
|
||||
fadd v0.4s, v0.4s, v3.4s
|
||||
.endm
|
||||
|
||||
/* Reduce the four lanes of v0 to the scalar SUMF: add the upper half onto the lower half, then add the remaining pair. */
.macro KERNEL_F8_FINALIZE
|
||||
ext v1.16b, v0.16b, v0.16b, #8
|
||||
fadd v0.2s, v0.2s, v1.2s
|
||||
faddp SUMF, v0.2s
|
||||
.endm
|
||||
|
||||
/* Strided path setup: scale INC_X by 8 (two 4-byte floats per complex element). */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #3
|
||||
.endm
|
||||
|
||||
/* Strided, one complex element: same as KERNEL_F1 but X is post-incremented by INC_X. */
.macro KERNEL_S1
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
ext v2.8b, v1.8b, v1.8b, #4
|
||||
fadd TMPF, TMPF, s2
|
||||
fadd SUMF, SUMF, TMPF
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Single-precision complex sum: adds both floats of each of the N complex
 * elements of X (stride INC_X, in elements) and returns the total in SUMF.
 * Returns 0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
|
||||
|
||||
/* Zero the accumulator and the scalar temporary. */
fmov SUMF, REG0
|
||||
fmov s1, SUMF
|
||||
|
||||
cmp N, xzr
|
||||
ble .Lcsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
/* Unit stride uses the vectorized path, anything else the strided path. */
cmp INC_X, #1
|
||||
bne .Lcsum_kernel_S_BEGIN
|
||||
|
||||
.Lcsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = N >> 3: number of full KERNEL_F8 iterations. */
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq .Lcsum_kernel_F1
|
||||
|
||||
.Lcsum_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_F8
|
||||
|
||||
/* Collapse the vector accumulator to the scalar SUMF before the scalar tail. */
KERNEL_F8_FINALIZE
|
||||
|
||||
/* Unit-stride tail: the remaining N & 7 elements, one per KERNEL_F1. */
.Lcsum_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
.Lcsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_F10
|
||||
|
||||
/* Shared exit for the early-out checks and the unit-stride path. */
.Lcsum_kernel_L999:
|
||||
ret
|
||||
|
||||
/* Strided path: scale INC_X to bytes, then process four elements per loop pass. */
.Lcsum_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lcsum_kernel_S1
|
||||
|
||||
.Lcsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_S4
|
||||
|
||||
/* Strided tail: the remaining N & 3 elements. */
.Lcsum_kernel_S1:
|
||||
|
||||
ands I, N, #3
|
||||
ble .Lcsum_kernel_L999
|
||||
|
||||
.Lcsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lcsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,186 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define SUMF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro KERNEL_F1
|
||||
ldr TMPF, [X], #SZ
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0]
|
||||
fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0]
|
||||
fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0]
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
|
||||
add X, X, #64
|
||||
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
|
||||
fadd v2.2d, v2.2d, v3.2d
|
||||
fadd v4.2d, v4.2d, v5.2d
|
||||
fadd v0.2d, v0.2d, v2.2d
|
||||
fadd v0.2d, v0.2d, v4.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F8_FINALIZE
|
||||
#if !defined(DOUBLE)
|
||||
ext v1.16b, v0.16b, v0.16b, #8
|
||||
fadd v0.2s, v0.2s, v1.2s
|
||||
faddp SUMF, v0.2s
|
||||
#else
|
||||
faddp SUMF, v0.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided-path setup: scale INC_X by the element size (x4 or x8) so it can
 * be used directly as the post-index register in KERNEL_S1. */
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided step: load one element into lane 0 of v1 (read back as TMPF),
 * post-increment X by INC_X, and add the element to SUMF. */
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Plain (signed) sum of N elements of X taken every INC_X elements.
 * The result is accumulated in SUMF, which is left at zero when N <= 0 or
 * INC_X <= 0. INC_X == 1 takes the vectorised path (8 elements per
 * iteration plus a scalar remainder); any other positive stride takes the
 * scalar path unrolled by 4. */
PROLOGUE
|
||||
|
||||
/* clear the accumulator and the scratch register */
fmov SUMF, REG0
|
||||
#if !defined(DOUBLE)
|
||||
fmov s1, SUMF
|
||||
#else
|
||||
fmov d1, SUMF
|
||||
#endif
|
||||
|
||||
/* nothing to do for a non-positive length or stride */
cmp N, xzr
|
||||
ble .Lsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
/* only a stride of exactly one element uses the contiguous path */
cmp INC_X, #1
|
||||
bne .Lsum_kernel_S_BEGIN
|
||||
|
||||
.Lsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = number of full 8-element groups */
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq .Lsum_kernel_F1
|
||||
|
||||
.Lsum_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_F8
|
||||
|
||||
/* fold the vector lanes into SUMF before the scalar remainder uses it */
KERNEL_F8_FINALIZE
|
||||
|
||||
.Lsum_kernel_F1:
|
||||
|
||||
/* I = N mod 8 leftover elements */
ands I, N, #7
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
.Lsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_F10
|
||||
|
||||
.Lsum_kernel_L999:
|
||||
ret
|
||||
|
||||
.Lsum_kernel_S_BEGIN:
|
||||
|
||||
/* convert the stride from elements to bytes */
INIT_S
|
||||
|
||||
/* I = number of full 4-element groups */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lsum_kernel_S1
|
||||
|
||||
.Lsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_S4
|
||||
|
||||
.Lsum_kernel_S1:
|
||||
|
||||
/* I = N mod 4 leftover elements */
ands I, N, #3
|
||||
ble .Lsum_kernel_L999
|
||||
|
||||
.Lsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,158 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* This file fixes the aliases to the 64-bit (double) register forms; there
 * is no single-precision branch here. */
#define REG0 xzr /* zero register used to clear SUMF */
|
||||
#define SUMF d0 /* running sum */
|
||||
#define TMPF d1 /* scalar scratch */
|
||||
#define TMPVF {v1.d}[0] /* lane 0 of v1 */
|
||||
#define SZ 8 /* size in bytes of one double */
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Unit-stride step for one complex element: load its two doubles
 * (16 bytes, X post-incremented), add them together, and add the result
 * to SUMF. */
.macro KERNEL_F1
|
||||
ld1 {v1.2d}, [X], #16
|
||||
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/* Unit-stride step for four complex elements (eight doubles, 64 bytes).
 * The values are folded into the two lanes of v0; KERNEL_F4_FINALIZE
 * later adds those lanes into SUMF. */
.macro KERNEL_F4
|
||||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
|
||||
|
||||
/* pair up the four vectors */
fadd v1.2d, v1.2d, v2.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
|
||||
/* accumulate into the two partial sums in v0 */
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v0.2d, v0.2d, v3.2d
|
||||
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
||||
/* Add the two lanes of v0 together, leaving the scalar total in SUMF. */
.macro KERNEL_F4_FINALIZE
|
||||
faddp SUMF, v0.2d
|
||||
.endm
|
||||
|
||||
/* Strided-path setup: scale INC_X by 16, the number of bytes KERNEL_S1
 * loads per element, so it can serve as the post-index register. */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #4
|
||||
.endm
|
||||
|
||||
/* Strided step for one complex element: load its two doubles, advance X
 * by INC_X bytes, add the two doubles together and add that to SUMF. */
.macro KERNEL_S1
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Sum of both components of N two-double elements of X taken every INC_X
 * elements. The result is accumulated in SUMF, which is left at zero when
 * N <= 0 or INC_X <= 0. INC_X == 1 takes the vectorised path (4 elements
 * per iteration plus a remainder); any other positive stride takes the
 * per-element path unrolled by 4. */
PROLOGUE
|
||||
|
||||
/* clear the accumulator */
fmov SUMF, REG0
|
||||
|
||||
/* nothing to do for a non-positive length or stride */
cmp N, xzr
|
||||
ble .Lzsum_kernel_L999
|
||||
cmp INC_X, xzr
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
/* only a stride of exactly one element uses the contiguous path */
cmp INC_X, #1
|
||||
bne .Lzsum_kernel_S_BEGIN
|
||||
|
||||
.Lzsum_kernel_F_BEGIN:
|
||||
|
||||
/* I = number of full 4-element groups */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
beq .Lzsum_kernel_F1
|
||||
|
||||
.Lzsum_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_F4
|
||||
|
||||
/* fold the vector lanes into SUMF before the remainder loop uses it */
KERNEL_F4_FINALIZE
|
||||
|
||||
.Lzsum_kernel_F1:
|
||||
|
||||
/* I = N mod 4 leftover elements */
ands I, N, #3
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
.Lzsum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_F10
|
||||
|
||||
.Lzsum_kernel_L999:
|
||||
ret
|
||||
|
||||
.Lzsum_kernel_S_BEGIN:
|
||||
|
||||
/* convert the stride from elements to bytes */
INIT_S
|
||||
|
||||
/* I = number of full 4-element groups */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble .Lzsum_kernel_S1
|
||||
|
||||
.Lzsum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_S4
|
||||
|
||||
.Lzsum_kernel_S1:
|
||||
|
||||
/* I = N mod 4 leftover elements */
ands I, N, #3
|
||||
ble .Lzsum_kernel_L999
|
||||
|
||||
.Lzsum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne .Lzsum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
|
|||
ZASUMKERNEL = asum.S
|
||||
XASUMKERNEL = asum.S
|
||||
|
||||
CSUMKERNEL = sum.S
|
||||
ZSUMKERNEL = sum.S
|
||||
XSUMKERNEL = sum.S
|
||||
|
||||
CNRM2KERNEL = nrm2.S
|
||||
ZNRM2KERNEL = nrm2.S
|
||||
XNRM2KERNEL = nrm2.S
|
||||
|
|
|
@ -0,0 +1,358 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2019, The OpenBLAS project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Prefetch distance, in elements (it is multiplied by SIZE where PRE1 is
 * initialised); chosen per precision. */
#ifdef XDOUBLE
|
||||
#define PREFETCH_SIZE ( 8 * 16 + 4)
|
||||
#elif defined(DOUBLE)
|
||||
#define PREFETCH_SIZE (16 * 16 + 8)
|
||||
#else
|
||||
#define PREFETCH_SIZE (32 * 16 + 16)
|
||||
#endif
|
||||
|
||||
/* COMPADD is 1 for complex data and 0 otherwise; it adjusts the shift
 * counts used below. STRIDE is the post-increment of every other load:
 * the full INCX for real data, or SIZE for complex data, where the
 * following load then advances by INCX. */
#ifndef COMPLEX
|
||||
#define COMPADD 0
|
||||
#define STRIDE INCX
|
||||
#else
|
||||
#define COMPADD 1
|
||||
#define STRIDE SIZE
|
||||
#endif
|
||||
|
||||
#define PRE1 r2 /* prefetch pointer */
|
||||
|
||||
#define I r17 /* main-loop trip count */
|
||||
#define J r18 /* leftover count after the main loop */
|
||||
#define INCX16 r21 /* prefetch pointer increment */
|
||||
|
||||
#define PR r30 /* saved predicate registers */
|
||||
#define ARLC r31 /* saved ar.lc */
|
||||
|
||||
#define N r32 /* vector length */
|
||||
#define X r33 /* X vector address */
|
||||
#define INCX r34 /* X stride */
|
||||
|
||||
|
||||
/* Entry and setup. Clears the four partial sums f8..f11 (f0 is the IA-64
 * constant 0.0), returns immediately when INCX <= 0 or N <= 0, splits N
 * into a main-loop trip count (I) and a leftover count (J), converts INCX
 * to bytes, and primes the pipeline predicates p16..p19 before entering
 * .L52, or skips to .L55 when there is no full main-loop iteration. */
PROLOGUE
|
||||
.prologue
|
||||
PROFCODE
|
||||
{ .mfi
|
||||
/* prefetch pointer runs PREFETCH_SIZE elements ahead of X */
adds PRE1 = PREFETCH_SIZE * SIZE, X
|
||||
mov f8 = f0
|
||||
.save ar.lc, ARLC
|
||||
/* keep the caller's loop counter so .L998 can restore it */
mov ARLC = ar.lc
|
||||
}
|
||||
;;
|
||||
.body
|
||||
#ifdef F_INTERFACE
|
||||
/* with F_INTERFACE, N and INCX arrive as addresses: load the values */
{ .mmi
|
||||
LDINT N = [N]
|
||||
LDINT INCX = [INCX]
|
||||
nop.i 0
|
||||
}
|
||||
;;
|
||||
#ifndef USE64BITINT
|
||||
/* sign-extend the 32-bit values */
{ .mii
|
||||
nop.m 0
|
||||
sxt4 N = N
|
||||
sxt4 INCX = INCX
|
||||
}
|
||||
;;
|
||||
#endif
|
||||
#endif
|
||||
{ .mmi
|
||||
/* p6 = !(0 < INCX), p7 = !(0 < N): both trigger the early return below */
cmp.lt p0, p6 = r0, INCX
|
||||
cmp.lt p0, p7 = r0, N
|
||||
/* I = N / 16 for real data, N / 8 for complex */
shr I = N, (4 - COMPADD)
|
||||
}
|
||||
{ .mbb
|
||||
/* J = the low bits of N that the main loop does not cover */
and J = ((1 << (4 - COMPADD)) - 1), N
|
||||
(p6) br.ret.sptk.many b0
|
||||
(p7) br.ret.sptk.many b0
|
||||
}
|
||||
;;
|
||||
{ .mfi
|
||||
/* I - 1 is what gets written to ar.lc below */
adds I = -1, I
|
||||
mov f10 = f0
|
||||
/* save the predicates; .L998 restores them */
mov PR = pr
|
||||
}
|
||||
{ .mfi
|
||||
/* p9 = no leftover elements */
cmp.eq p9, p0 = r0, J
|
||||
mov f9 = f0
|
||||
/* p12 = bit (3 - COMPADD) of N is set */
tbit.z p0, p12 = N, 3 - COMPADD
|
||||
}
|
||||
;;
|
||||
/* pipeline start state: stage predicate p16 on, p17..p19 off */
{ .mmi
|
||||
cmp.eq p16, p0 = r0, r0
|
||||
cmp.ne p17, p0 = r0, r0
|
||||
mov ar.ec= 3
|
||||
}
|
||||
{ .mfi
|
||||
cmp.ne p18, p0 = r0, r0
|
||||
mov f11 = f0
|
||||
/* stride in bytes (twice as large for complex data) */
shl INCX = INCX, BASE_SHIFT + COMPADD
|
||||
}
|
||||
;;
|
||||
{ .mmi
|
||||
/* INCX16 = byte stride shifted left again; used to advance PRE1 */
#ifdef XDOUBLE
|
||||
shladd INCX16 = INCX, (3 - COMPADD), r0
|
||||
#else
|
||||
shladd INCX16 = INCX, (4 - COMPADD), r0
|
||||
#endif
|
||||
cmp.ne p19, p0 = r0, r0
|
||||
mov ar.lc = I
|
||||
}
|
||||
{ .mmb
|
||||
/* p8 = I is negative, i.e. no full main-loop iteration */
cmp.gt p8 ,p0 = r0, I
|
||||
#ifdef COMPLEX
|
||||
/* the first load of each pair already advances X by SIZE (STRIDE), so
   the second only needs the rest of the stride */
adds INCX = - SIZE, INCX
|
||||
#else
|
||||
nop.m 0
|
||||
#endif
|
||||
(p8) br.cond.dpnt .L55
|
||||
}
|
||||
;;
|
||||
.align 32
|
||||
|
||||
/* Main loop, closed by br.ctop. Each pass issues 16 loads under p16 and
 * spreads the matching additions round-robin over f8..f11. The additions
 * run under p18 or p19 and name registers two or three above the load
 * targets (f32 -> f34, f68 -> f71).
 * NOTE(review): this looks like the usual rotating-register pipelining,
 * where a value loaded under p16 is consumed two or three rotations
 * later - confirm against the ar.ec = 3 setup. */
.L52:
|
||||
{ .mmf
|
||||
(p16) lfetch.nt1 [PRE1], INCX16
|
||||
(p16) LDFD f32 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f8 = f8, f71
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f35 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f9 = f9, f74
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f38 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f10 = f10, f77
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f41 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p19) FADD f11 = f11, f80
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f44 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f34
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f47 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f37
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f50 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f40
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f53 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f43
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
/* extended precision issues a second prefetch per pass */
#ifdef XDOUBLE
|
||||
(p16) lfetch.nt1 [PRE1], INCX16
|
||||
#endif
|
||||
(p16) LDFD f56 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f46
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f59 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f49
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f62 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f52
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f65 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f55
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f68 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f8 = f8, f58
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f71 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f9 = f9, f61
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f74 = [X], STRIDE
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f10 = f10, f64
|
||||
}
|
||||
;;
|
||||
{ .mmf
|
||||
(p16) LDFD f77 = [X], INCX
|
||||
}
|
||||
{ .mfb
|
||||
(p18) FADD f11 = f11, f67
|
||||
br.ctop.sptk.few .L52
|
||||
}
|
||||
;;
|
||||
/* Unpredicated repeat of the four p19 additions.
   NOTE(review): presumably these pick up the last pass's final four
   loads, which never reach the p19 stage inside the loop - confirm. */
FADD f8 = f8, f71
|
||||
FADD f9 = f9, f74
|
||||
FADD f10 = f10, f77
|
||||
FADD f11 = f11, f80
|
||||
.align 32
|
||||
;;
|
||||
/* Remainder after the main loop. Predicates p12..p15 are set from
 * individual bits of N: p12 guards 8 loads, p13 guards 4, p14 guards 2
 * and, for real data only, p15 guards a single load. The loaded values
 * are added round-robin into f8..f11. When there are no leftovers (p9)
 * control goes straight to .L998. */
.L55:
|
||||
(p12) LDFD f32 = [X], STRIDE
|
||||
(p9) br.cond.dptk .L998
|
||||
;;
|
||||
(p12) LDFD f33 = [X], INCX
|
||||
;;
|
||||
(p12) LDFD f34 = [X], STRIDE
|
||||
;;
|
||||
(p12) LDFD f35 = [X], INCX
|
||||
/* p13 = bit (2 - COMPADD) of N is set */
tbit.z p0, p13 = N, (2 - COMPADD)
|
||||
;;
|
||||
(p12) LDFD f36 = [X], STRIDE
|
||||
/* p14 = bit (1 - COMPADD) of N is set */
tbit.z p0, p14 = N, (1 - COMPADD)
|
||||
;;
|
||||
(p12) LDFD f37 = [X], INCX
|
||||
#ifndef COMPLEX
|
||||
/* p15 = N is odd (real data only) */
tbit.z p0, p15 = N, 0
|
||||
#endif
|
||||
;;
|
||||
(p12) LDFD f38 = [X], STRIDE
|
||||
;;
|
||||
(p12) LDFD f39 = [X], INCX
|
||||
;;
|
||||
(p13) LDFD f40 = [X], STRIDE
|
||||
;;
|
||||
(p13) LDFD f41 = [X], INCX
|
||||
;;
|
||||
(p13) LDFD f42 = [X], STRIDE
|
||||
(p12) FADD f8 = f8, f32
|
||||
;;
|
||||
(p13) LDFD f43 = [X], INCX
|
||||
(p12) FADD f9 = f9, f33
|
||||
;;
|
||||
(p14) LDFD f44 = [X], STRIDE
|
||||
(p12) FADD f10 = f10, f34
|
||||
;;
|
||||
(p14) LDFD f45 = [X], INCX
|
||||
(p12) FADD f11 = f11, f35
|
||||
;;
|
||||
#ifndef COMPLEX
|
||||
(p15) LDFD f46 = [X]
|
||||
#endif
|
||||
(p12) FADD f8 = f8, f36
|
||||
;;
|
||||
(p12) FADD f9 = f9, f37
|
||||
(p12) FADD f10 = f10, f38
|
||||
(p12) FADD f11 = f11, f39
|
||||
;;
|
||||
(p13) FADD f8 = f8, f40
|
||||
(p13) FADD f9 = f9, f41
|
||||
/* this conditional section is empty */
#ifndef COMPLEX
|
||||
#endif
|
||||
(p13) FADD f10 = f10, f42
|
||||
;;
|
||||
(p13) FADD f11 = f11, f43
|
||||
(p14) FADD f8 = f8, f44
|
||||
(p14) FADD f9 = f9, f45
|
||||
#ifndef COMPLEX
|
||||
(p15) FADD f10 = f10, f46
|
||||
#endif
|
||||
;;
|
||||
.align 32
|
||||
|
||||
/* Final reduction and return: combine the four partial sums into f8,
 * restore ar.lc and the predicate registers saved at entry, and return. */
.L998:
|
||||
{ .mfi
|
||||
FADD f8 = f8, f9
|
||||
/* restore the caller's loop counter */
mov ar.lc = ARLC
|
||||
}
|
||||
{ .mmf
|
||||
FADD f10 = f10, f11
|
||||
}
|
||||
;;
|
||||
{ .mii
|
||||
/* restore predicates from PR; the immediate selects which ones are written */
mov pr = PR, -65474
|
||||
}
|
||||
;;
|
||||
{ .mfb
|
||||
FADD f8 = f8, f10
|
||||
br.ret.sptk.many b0
|
||||
}
|
||||
EPILOGUE
|
|
@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c
|
|||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SSUMKERNEL = ../mips/sum.c
|
||||
DSUMKERNEL = ../mips/sum.c
|
||||
CSUMKERNEL = ../mips/zsum.c
|
||||
ZSUMKERNEL = ../mips/zsum.c
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SASUMKERNEL = ../mips/sasum_msa.c
|
||||
DASUMKERNEL = ../mips/dasum_msa.c
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
|
||||
/*
 * Plain (signed) sum of a strided vector.
 *
 * Adds x[0], x[inc_x], x[2*inc_x], ... for n elements, in index order,
 * and returns the total. Returns 0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	/* Reject empty input and non-positive strides up front. */
	if (!(n > 0 && inc_x > 0)) {
		return total;
	}

	/* One past the last index that will be visited. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		total += x[pos];
	}

	return total;
}
|
||||
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CSUM1(x,i) x[i]+x[i+1]
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
{
|
||||
sumf += CSUM1(x,i);
|
||||
i += inc_x2;
|
||||
}
|
||||
return(sumf);
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,332 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases. N, X and INCX are the three arguments; I and TEMP are
 * integer scratch; a1..a8 receive loaded elements; t1..t4 stage values
 * on their way into the two accumulators s1 and s2. */
#define N $4
|
||||
#define X $5
|
||||
#define INCX $6
|
||||
|
||||
#define I $2
|
||||
#define TEMP $3
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
|
||||
#define t1 $f10
|
||||
#define t2 $f11
|
||||
#define t3 $f12
|
||||
#define t4 $f13
|
||||
|
||||
#define s1 $f0
|
||||
#define s2 $f1
|
||||
|
||||
/* Entry: clear both accumulators, convert INCX to bytes, leave for .L999
 * when N <= 0, pick the unit-stride or strided path, and preload the
 * first eight elements for the unit-stride loop.
 * NOTE(review): the instruction after each branch appears to run in the
 * branch delay slot (other branches are padded with NOP) - confirm that
 * PROLOGUE selects noreorder mode. */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
/* with F_INTERFACE, N and INCX arrive as addresses: load the values */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
MTC $0, s1
|
||||
|
||||
MTC $0, s2
|
||||
/* stride in bytes */
dsll INCX, INCX, BASE_SHIFT
|
||||
|
||||
blez N, .L999
|
||||
li TEMP, SIZE
|
||||
|
||||
/* a byte stride other than SIZE takes the strided path at .L20;
   I = N / 8 is computed either way */
bne INCX, TEMP, .L20
|
||||
dsra I, N, 3
|
||||
|
||||
/* fewer than eight elements: go straight to the remainder loop */
blez I, .L15
|
||||
NOP
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
LD a3, 2 * SIZE(X)
|
||||
LD a4, 3 * SIZE(X)
|
||||
|
||||
LD a5, 4 * SIZE(X)
|
||||
MOV t1, a1
|
||||
LD a6, 5 * SIZE(X)
|
||||
MOV t2, a2
|
||||
LD a7, 6 * SIZE(X)
|
||||
MOV t3, a3
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
/* with a single group of eight, skip the loop and drain at .L13 */
blez I, .L13
|
||||
LD a8, 7 * SIZE(X)
|
||||
.align 3
|
||||
|
||||
/* Unit-stride main loop (.L12) and its drain (.L13). Each pass adds the
 * eight values staged earlier (t1..t4 twice) while loading the next eight
 * elements into a1..a8; sums alternate between s1 and s2. .L13 adds the
 * last eight staged values without loading anything further. */
.L12:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 8 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddiu I, I, -1
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 9 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
NOP
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 10 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 11 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
/* X moves on by eight elements; the loads below use the new base */
daddiu X, X, 8 * SIZE
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 4 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 5 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
NOP
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 6 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a8, 7 * SIZE(X)
|
||||
|
||||
bgtz I, .L12
|
||||
MOV t4, a4
|
||||
.align 3
|
||||
|
||||
.L13:
|
||||
ADD s1, s1, t1
|
||||
/* step past the group just consumed so the remainder loop starts after it */
daddiu X, X, 8 * SIZE
|
||||
|
||||
MOV t1, a5
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* Unit-stride remainder: add the N mod 8 leftover elements to s1 one at
 * a time, then jump to the common exit. */
.L15:
|
||||
andi I, N, 7
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu I, I, -1
|
||||
|
||||
MOV t1, a1
|
||||
|
||||
ADD s1, s1, t1
|
||||
|
||||
bgtz I, .L16
|
||||
daddiu X, X, SIZE
|
||||
|
||||
j .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
/* Strided path. I still holds N / 8 from the dsra that follows the
 * branch leading here. .L20 preloads eight elements, advancing X by INCX
 * bytes after each load; .L23 adds eight staged values per pass while
 * loading the next eight; .L24 drains the last eight staged values.
 * Sums alternate between s1 and s2. */
.L20:
|
||||
/* fewer than eight elements: go straight to the remainder loop */
blez I, .L25
|
||||
NOP
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a2, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a3, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a4, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a6, 0 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t1, a1
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t3, a3
|
||||
LD a8, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
/* with a single group of eight, skip the loop and drain at .L24 */
blez I, .L24
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 0 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a8, 0 * SIZE(X)
|
||||
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
bgtz I, .L23
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
ADD s1, s1, t1
|
||||
MOV t1, a5
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* Strided remainder and common exit. .L26 adds the N mod 8 leftover
 * elements to s1 one at a time, stepping X by INCX bytes. .L999 returns
 * through $31 and folds s2 into s1.
 * NOTE(review): the ADD after each branch/jump appears to run in its
 * delay slot - confirm that PROLOGUE selects noreorder mode. */
.L25:
|
||||
andi I, N, 7
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu I, I, -1
|
||||
|
||||
MOV t1, a1
|
||||
daddu X, X, INCX
|
||||
|
||||
bgtz I, .L26
|
||||
ADD s1, s1, t1
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
j $31
|
||||
ADD s1, s1, s2
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,204 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases. N, X and INCX are the three arguments; I and TEMP are
 * integer scratch; a1..a8 receive loaded values; t1..t4 stage values on
 * their way into the two accumulators s1 and s2. */
#define N $4
|
||||
#define X $5
|
||||
#define INCX $6
|
||||
|
||||
#define I $2
|
||||
#define TEMP $3
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
|
||||
#define t1 $f10
|
||||
#define t2 $f11
|
||||
#define t3 $f12
|
||||
#define t4 $f13
|
||||
|
||||
#define s1 $f0
|
||||
#define s2 $f1
|
||||
|
||||
/* Entry: clear both accumulators, convert INCX to bytes with
 * ZBASE_SHIFT, leave for .L999 when N <= 0, and preload the first four
 * two-value elements (offsets 0 and 1 * SIZE at each position).
 * NOTE(review): the instruction after each branch appears to run in the
 * branch delay slot - confirm that PROLOGUE selects noreorder mode. */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
/* with F_INTERFACE, N and INCX arrive as addresses: load the values */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
MTC $0, s1
|
||||
|
||||
MTC $0, s2
|
||||
dsll INCX, INCX, ZBASE_SHIFT
|
||||
|
||||
/* I = N / 4 is computed whether or not the branch is taken */
blez N, .L999
|
||||
dsra I, N, 2
|
||||
|
||||
/* fewer than four elements: go straight to the remainder loop */
blez I, .L25
|
||||
NOP
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a3, 0 * SIZE(X)
|
||||
LD a4, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
LD a5, 0 * SIZE(X)
|
||||
LD a6, 1 * SIZE(X)
|
||||
daddu X, X, INCX
|
||||
|
||||
MOV t1, a1
|
||||
MOV t2, a2
|
||||
|
||||
LD a7, 0 * SIZE(X)
|
||||
LD a8, 1 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
MOV t4, a4
|
||||
daddiu I, I, -1
|
||||
|
||||
/* with a single group of four, skip the loop and drain at .L24 */
blez I, .L24
|
||||
daddu X, X, INCX
|
||||
.align 3
|
||||
|
||||
/* Main loop (.L23) and drain (.L24). Each pass adds the eight values
 * staged earlier while loading the next four elements; values read at
 * offset 0 go into s1 and values read at offset 1 * SIZE go into s2.
 * .L24 adds the last eight staged values without loading anything
 * further. */
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a5
|
||||
daddiu I, I, -1
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a2, 1 * SIZE(X)
|
||||
|
||||
MOV t2, a6
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a3, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a7
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t4
|
||||
LD a4, 1 * SIZE(X)
|
||||
|
||||
MOV t4, a8
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
LD a5, 0 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
NOP
|
||||
|
||||
ADD s2, s2, t2
|
||||
LD a6, 1 * SIZE(X)
|
||||
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t3
|
||||
LD a7, 0 * SIZE(X)
|
||||
|
||||
MOV t3, a3
|
||||
LD a8, 1 * SIZE(X)
|
||||
|
||||
ADD s2, s2, t4
|
||||
daddu X, X, INCX
|
||||
|
||||
bgtz I, .L23
|
||||
MOV t4, a4
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
ADD s1, s1, t1
|
||||
MOV t1, a5
|
||||
|
||||
ADD s2, s2, t2
|
||||
MOV t2, a6
|
||||
|
||||
ADD s1, s1, t3
|
||||
MOV t3, a7
|
||||
|
||||
ADD s2, s2, t4
|
||||
MOV t4, a8
|
||||
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* Remainder and common exit. .L26 handles the N mod 4 leftover elements
 * one at a time, adding the value at offset 0 to s1 and the value at
 * offset 1 * SIZE to s2. .L999 returns through $31 and folds s2 into s1.
 * NOTE(review): the ADD after each branch/jump appears to run in its
 * delay slot - confirm that PROLOGUE selects noreorder mode. */
.L25:
|
||||
andi I, N, 3
|
||||
|
||||
blez I, .L999
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, 0 * SIZE(X)
|
||||
LD a2, 1 * SIZE(X)
|
||||
|
||||
MOV t1, a1
|
||||
daddiu I, I, -1
|
||||
MOV t2, a2
|
||||
daddu X, X, INCX
|
||||
|
||||
ADD s1, s1, t1
|
||||
bgtz I, .L26
|
||||
ADD s2, s2, t2
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
j $31
|
||||
ADD s1, s1, s2
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,446 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Symbolic names used by the POWER sum kernel that follows.
 *
 *   N, X, INCX  element count, vector base address and stride, in the
 *               registers in which the kernel reads them on entry
 *   PREA        holds L1_PREFETCHSIZE, the offset handed to L1_PREFETCH
 *   FZERO       register that is zeroed first and then copied into the
 *               other partial-sum registers
 *   STACKSIZE   bytes reserved below SP: f14-f31 are spilled at offsets
 *               0..136 and a scratch word at 144 is used to build FZERO
 */
#define N		r3
#define X		r4
#define INCX		r5

#define PREA		r8

#define FZERO		f0

#define STACKSIZE	160
|
||||
|
||||
/*
 * SUM kernel for POWER, real data.
 *
 * Adds N values read from X, INCX elements apart.  Values are copied with
 * fmr (a plain move, no absolute value) before being accumulated.  Eight
 * partial sums live in f0-f7; LL(999) folds them together and leaves the
 * total in f1.  If N <= 0 or INCX <= 0 nothing is read and the total is 0.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), LDINT, LFD, LFDUX, FADD, SIZE,
 * BASE_SHIFT, CTR, L1_PREFETCH and L1_PREFETCHSIZE are supplied by
 * common.h and are not defined in this file.
 */
	PROLOGUE
	PROFCODE

	/* Reserve the frame and spill f14-f31; they are reloaded at LL(999). */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)

	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)

	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)

	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)

	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* Build 0.0 in FZERO: store an integer zero, load it back as a float. */
	stw	r0, 144(SP)
	lfs	FZERO, 144(SP)

#ifdef F_INTERFACE
	/* In this configuration N and INCX hold addresses; fetch the values. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Scale the stride by BASE_SHIFT so it can be compared against SIZE. */
	slwi	INCX, INCX, BASE_SHIFT

	/* Clear the remaining seven partial sums. */
	fmr	f1, FZERO
	fmr	f2, FZERO
	fmr	f3, FZERO
	fmr	f4, FZERO
	fmr	f5, FZERO
	fmr	f6, FZERO
	fmr	f7, FZERO

	li	PREA, L1_PREFETCHSIZE

	/* Nothing to do for N <= 0 or a non-positive stride. */
	cmpwi	cr0, N, 0
	ble-	LL(999)

	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	/* Any stride other than one element takes the LL(100) path. */
	cmpwi	cr0, INCX, SIZE
	bne-	cr0, LL(100)

	/* ---- unit stride: N >> 4 blocks of 16 elements ---- */
	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	cr0, LL(50)
	.align 4

	/* Preload the first block: 0..7 into f8-f15, 8..15 into f24-f31. */
	LFD	f8,   0 * SIZE(X)
	LFD	f9,   1 * SIZE(X)
	LFD	f10,  2 * SIZE(X)
	LFD	f11,  3 * SIZE(X)
	LFD	f12,  4 * SIZE(X)
	LFD	f13,  5 * SIZE(X)
	LFD	f14,  6 * SIZE(X)
	LFD	f15,  7 * SIZE(X)

	LFD	f24,  8 * SIZE(X)
	LFD	f25,  9 * SIZE(X)
	LFD	f26, 10 * SIZE(X)
	LFD	f27, 11 * SIZE(X)
	LFD	f28, 12 * SIZE(X)
	LFD	f29, 13 * SIZE(X)
	LFD	f30, 14 * SIZE(X)
	LFD	f31, 15 * SIZE(X)

	/* Stage elements 0..7 in f16-f23, the registers the FADDs consume. */
	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	/* CTR is decremented here; if it hit zero there is only one block. */
	bdz	LL(20)
	.align 4

	/*
	 * Steady state: add the staged block while loading the next 16
	 * elements (offsets 16..31 from the not-yet-advanced X).
	 */
LL(10):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFD	f8,  16 * SIZE(X)
	LFD	f9,  17 * SIZE(X)
	LFD	f10, 18 * SIZE(X)
	LFD	f11, 19 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFD	f12, 20 * SIZE(X)
	LFD	f13, 21 * SIZE(X)
	LFD	f14, 22 * SIZE(X)
	LFD	f15, 23 * SIZE(X)

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFD	f24, 24 * SIZE(X)
	LFD	f25, 25 * SIZE(X)
	LFD	f26, 26 * SIZE(X)
	LFD	f27, 27 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFD	f28, 28 * SIZE(X)
	LFD	f29, 29 * SIZE(X)
	LFD	f30, 30 * SIZE(X)
	LFD	f31, 31 * SIZE(X)

	/* The prefetch is issued before the pointer bump, except on POWER6. */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif

	bdnz	LL(10)
	.align 4

	/* Drain: add the last loaded block (no further loads). */
LL(20):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	/* Step X past the block that was just drained. */
	addi	X, X, 16 * SIZE
	.align 4

	/* ---- unit stride: the remaining N & 15 elements, one at a time ---- */
LL(50):
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(60):
	LFD	f8, 0 * SIZE(X)
	addi	X, X, 1 * SIZE

	FADD	f0, f0, f8

	bdnz	LL(60)
	b	LL(999)
	.align 4

	/*
	 * ---- general stride ----
	 * X is moved back by one stride because every LFDUX below is given
	 * X and INCX and is expected to advance X as it loads (LFDUX comes
	 * from common.h; presumably the load-with-update form).
	 */
LL(100):
	sub	X, X, INCX

	srawi.	r0, N, 4
	mtspr	CTR, r0
	beq-	LL(150)

	/* Preload 16 elements, same register layout as the unit-stride path. */
	LFDUX	f8,  X, INCX
	LFDUX	f9,  X, INCX
	LFDUX	f10, X, INCX
	LFDUX	f11, X, INCX
	LFDUX	f12, X, INCX
	LFDUX	f13, X, INCX
	LFDUX	f14, X, INCX
	LFDUX	f15, X, INCX

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX
	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX

	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	bdz	LL(120)
	.align 4

	/* Steady state for the strided path; mirrors LL(10). */
LL(110):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFDUX	f8,  X, INCX
	LFDUX	f9,  X, INCX
	LFDUX	f10, X, INCX
	LFDUX	f11, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFDUX	f12, X, INCX
	LFDUX	f13, X, INCX
	LFDUX	f14, X, INCX
	LFDUX	f15, X, INCX

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFDUX	f24, X, INCX
	LFDUX	f25, X, INCX
	LFDUX	f26, X, INCX
	LFDUX	f27, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFDUX	f28, X, INCX
	LFDUX	f29, X, INCX
	LFDUX	f30, X, INCX
	LFDUX	f31, X, INCX
	bdnz	LL(110)
	.align 4

	/* Drain for the strided path; mirrors LL(20) without a pointer bump. */
LL(120):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	.align 4

	/* ---- general stride: the remaining N & 15 elements ---- */
LL(150):
	andi.	r0, N, 15
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDUX	f8, X, INCX
	FADD	f0, f0, f8
	bdnz	LL(160)
	.align 4

	/* ---- reduce f0..f7 pairwise; the grand total ends up in f1 ---- */
LL(999):
	FADD	f0, f0, f1
	FADD	f2, f2, f3
	FADD	f4, f4, f5
	FADD	f6, f6, f7

	FADD	f0, f0, f2
	FADD	f4, f4, f6
	FADD	f1, f0, f4

	/* Reload f14-f31 from the frame, release it and return. */
	lfd	f14,   0(SP)
	lfd	f15,   8(SP)
	lfd	f16,  16(SP)
	lfd	f17,  24(SP)

	lfd	f18,  32(SP)
	lfd	f19,  40(SP)
	lfd	f20,  48(SP)
	lfd	f21,  56(SP)

	lfd	f22,  64(SP)
	lfd	f23,  72(SP)
	lfd	f24,  80(SP)
	lfd	f25,  88(SP)

	lfd	f26,  96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)

	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
|
|
@ -0,0 +1,452 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Symbolic names used by the POWER complex sum kernel that follows.
 *
 *   N, X, INCX  element count, vector base address and stride, in the
 *               registers in which the kernel reads them on entry
 *   INCXM1      the kernel sets this to the scaled stride minus SIZE and
 *               uses it as the index for the first load of each element
 *   PREA        holds L1_PREFETCHSIZE, the offset handed to L1_PREFETCH
 *   FZERO       register that is zeroed first and then copied into the
 *               other partial-sum registers
 *   STACKSIZE   bytes reserved below SP: f14-f31 are spilled at offsets
 *               0..136 and a scratch word at 144 is used to build FZERO
 */
#define N		r3
#define X		r4
#define INCX		r5

#define INCXM1		r9
#define PREA		r8

#define FZERO		f0

#define STACKSIZE	160
|
||||
|
||||
/*
 * SUM kernel for POWER, complex data.
 *
 * Every element consists of two consecutive values (offsets 0 and SIZE).
 * Both values of all N elements, INCX elements apart, are added into one
 * scalar total.  Values are copied with fmr (a plain move, no absolute
 * value) before being accumulated.  Eight partial sums live in f0-f7;
 * LL(999) folds them together and leaves the total in f1.  If N <= 0 or
 * INCX <= 0 nothing is read and the total is 0.
 *
 * PROLOGUE, PROFCODE, EPILOGUE, LL(), LDINT, LFD, LFDX, LFDUX, FADD, SIZE,
 * ZBASE_SHIFT, CTR, L1_PREFETCH and L1_PREFETCHSIZE are supplied by
 * common.h and are not defined in this file.
 */
	PROLOGUE
	PROFCODE

	/* Reserve the frame and spill f14-f31; they are reloaded at LL(999). */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14,   0(SP)
	stfd	f15,   8(SP)
	stfd	f16,  16(SP)
	stfd	f17,  24(SP)

	stfd	f18,  32(SP)
	stfd	f19,  40(SP)
	stfd	f20,  48(SP)
	stfd	f21,  56(SP)

	stfd	f22,  64(SP)
	stfd	f23,  72(SP)
	stfd	f24,  80(SP)
	stfd	f25,  88(SP)

	stfd	f26,  96(SP)
	stfd	f27, 104(SP)
	stfd	f28, 112(SP)
	stfd	f29, 120(SP)

	stfd	f30, 128(SP)
	stfd	f31, 136(SP)

	/* Build 0.0 in FZERO: store an integer zero, load it back as a float. */
	stw	r0, 144(SP)
	lfs	FZERO, 144(SP)

#ifdef F_INTERFACE
	/* In this configuration N and INCX hold addresses; fetch the values. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* Scale the stride by ZBASE_SHIFT; INCXM1 is that stride minus SIZE. */
	slwi	INCX, INCX, ZBASE_SHIFT
	subi	INCXM1, INCX, SIZE

	/* Clear the remaining seven partial sums. */
	fmr	f1, FZERO
	fmr	f2, FZERO
	fmr	f3, FZERO
	fmr	f4, FZERO
	fmr	f5, FZERO
	fmr	f6, FZERO
	fmr	f7, FZERO

	li	PREA, L1_PREFETCHSIZE

	/* Nothing to do for N <= 0 or a non-positive stride. */
	cmpwi	cr0, N, 0
	ble-	LL(999)

	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	/* Any stride other than one element (2 * SIZE) takes the LL(100) path. */
	cmpwi	cr0, INCX, 2 * SIZE
	bne-	cr0, LL(100)

	/* ---- unit stride: N >> 3 blocks of 8 elements (16 values) ---- */
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	cr0, LL(50)
	.align 4

	/* Preload the first block: 0..7 into f8-f15, 8..15 into f24-f31. */
	LFD	f8,   0 * SIZE(X)
	LFD	f9,   1 * SIZE(X)
	LFD	f10,  2 * SIZE(X)
	LFD	f11,  3 * SIZE(X)
	LFD	f12,  4 * SIZE(X)
	LFD	f13,  5 * SIZE(X)
	LFD	f14,  6 * SIZE(X)
	LFD	f15,  7 * SIZE(X)

	LFD	f24,  8 * SIZE(X)
	LFD	f25,  9 * SIZE(X)
	LFD	f26, 10 * SIZE(X)
	LFD	f27, 11 * SIZE(X)
	LFD	f28, 12 * SIZE(X)
	LFD	f29, 13 * SIZE(X)
	LFD	f30, 14 * SIZE(X)
	LFD	f31, 15 * SIZE(X)

	/* Stage values 0..7 in f16-f23, the registers the FADDs consume. */
	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	/* CTR is decremented here; if it hit zero there is only one block. */
	bdz	LL(20)
	.align 4

	/*
	 * Steady state: add the staged block while loading the next 16
	 * values (offsets 16..31 from the not-yet-advanced X).
	 */
LL(10):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFD	f8,  16 * SIZE(X)
	LFD	f9,  17 * SIZE(X)
	LFD	f10, 18 * SIZE(X)
	LFD	f11, 19 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFD	f12, 20 * SIZE(X)
	LFD	f13, 21 * SIZE(X)
	LFD	f14, 22 * SIZE(X)
	LFD	f15, 23 * SIZE(X)

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFD	f24, 24 * SIZE(X)
	LFD	f25, 25 * SIZE(X)
	LFD	f26, 26 * SIZE(X)
	LFD	f27, 27 * SIZE(X)

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFD	f28, 28 * SIZE(X)
	LFD	f29, 29 * SIZE(X)
	LFD	f30, 30 * SIZE(X)
	LFD	f31, 31 * SIZE(X)

	/* The prefetch is issued before the pointer bump, except on POWER6. */
#ifndef POWER6
	L1_PREFETCH	X, PREA
#endif
	addi	X, X, 16 * SIZE
#ifdef POWER6
	L1_PREFETCH	X, PREA
#endif

	bdnz	LL(10)
	.align 4

	/* Drain: add the last loaded block (no further loads). */
LL(20):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	/* Step X past the block that was just drained. */
	addi	X, X, 16 * SIZE
	.align 4

	/* ---- unit stride: the remaining N & 7 elements, two values each ---- */
LL(50):
	andi.	r0, N, 7
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(60):
	LFD	f8, 0 * SIZE(X)
	LFD	f9, 1 * SIZE(X)
	addi	X, X, 2 * SIZE

	FADD	f0, f0, f8
	FADD	f1, f1, f9

	bdnz	LL(60)
	b	LL(999)
	.align 4

	/*
	 * ---- general stride ----
	 * X is moved back by INCXM1.  Each element is then fetched as a pair:
	 * LFDX with index INCXM1 for the first value, LFDUX with index INCX
	 * for the second.  LFDX/LFDUX come from common.h; presumably the
	 * indexed load and the indexed load-with-update, so that only the
	 * LFDUX advances X.
	 */
LL(100):
	sub	X, X, INCXM1

	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	LL(150)

	/* Preload 8 elements, same register layout as the unit-stride path. */
	LFDX	f8,  X, INCXM1
	LFDUX	f9,  X, INCX
	LFDX	f10, X, INCXM1
	LFDUX	f11, X, INCX
	LFDX	f12, X, INCXM1
	LFDUX	f13, X, INCX
	LFDX	f14, X, INCXM1
	LFDUX	f15, X, INCX

	LFDX	f24, X, INCXM1
	LFDUX	f25, X, INCX
	LFDX	f26, X, INCXM1
	LFDUX	f27, X, INCX
	LFDX	f28, X, INCXM1
	LFDUX	f29, X, INCX
	LFDX	f30, X, INCXM1
	LFDUX	f31, X, INCX

	fmr	f16, f8
	fmr	f17, f9
	fmr	f18, f10
	fmr	f19, f11

	fmr	f20, f12
	fmr	f21, f13
	fmr	f22, f14
	fmr	f23, f15
	bdz	LL(120)
	.align 4

	/* Steady state for the strided path; mirrors LL(10). */
LL(110):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	LFDX	f8,  X, INCXM1
	LFDUX	f9,  X, INCX
	LFDX	f10, X, INCXM1
	LFDUX	f11, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	LFDX	f12, X, INCXM1
	LFDUX	f13, X, INCX
	LFDX	f14, X, INCXM1
	LFDUX	f15, X, INCX

	FADD	f0, f0, f16
	fmr	f16, f8
	FADD	f1, f1, f17
	fmr	f17, f9

	FADD	f2, f2, f18
	fmr	f18, f10
	FADD	f3, f3, f19
	fmr	f19, f11

	LFDX	f24, X, INCXM1
	LFDUX	f25, X, INCX
	LFDX	f26, X, INCXM1
	LFDUX	f27, X, INCX

	FADD	f4, f4, f20
	fmr	f20, f12
	FADD	f5, f5, f21
	fmr	f21, f13

	FADD	f6, f6, f22
	fmr	f22, f14
	FADD	f7, f7, f23
	fmr	f23, f15

	LFDX	f28, X, INCXM1
	LFDUX	f29, X, INCX
	LFDX	f30, X, INCXM1
	LFDUX	f31, X, INCX
	bdnz	LL(110)
	.align 4

	/* Drain for the strided path; mirrors LL(20) without a pointer bump. */
LL(120):
	FADD	f0, f0, f16
	fmr	f16, f24
	FADD	f1, f1, f17
	fmr	f17, f25

	FADD	f2, f2, f18
	fmr	f18, f26
	FADD	f3, f3, f19
	fmr	f19, f27

	FADD	f4, f4, f20
	fmr	f20, f28
	FADD	f5, f5, f21
	fmr	f21, f29

	FADD	f6, f6, f22
	fmr	f22, f30
	FADD	f7, f7, f23
	fmr	f23, f31

	FADD	f0, f0, f16
	FADD	f1, f1, f17
	FADD	f2, f2, f18
	FADD	f3, f3, f19

	FADD	f4, f4, f20
	FADD	f5, f5, f21
	FADD	f6, f6, f22
	FADD	f7, f7, f23
	.align 4

	/* ---- general stride: the remaining N & 7 elements ---- */
LL(150):
	andi.	r0, N, 7
	mtspr	CTR, r0
	beq	LL(999)
	.align 4

LL(160):
	LFDX	f8, X, INCXM1
	LFDUX	f9, X, INCX
	FADD	f0, f0, f8
	FADD	f1, f1, f9
	bdnz	LL(160)
	.align 4

	/* ---- reduce f0..f7 pairwise; the grand total ends up in f1 ---- */
LL(999):
	FADD	f0, f0, f1
	FADD	f2, f2, f3
	FADD	f4, f4, f5
	FADD	f6, f6, f7

	FADD	f0, f0, f2
	FADD	f4, f4, f6
	FADD	f1, f0, f4

	/* Reload f14-f31 from the frame, release it and return. */
	lfd	f14,   0(SP)
	lfd	f15,   8(SP)
	lfd	f16,  16(SP)
	lfd	f17,  24(SP)

	lfd	f18,  32(SP)
	lfd	f19,  40(SP)
	lfd	f20,  48(SP)
	lfd	f21,  56(SP)

	lfd	f22,  64(SP)
	lfd	f23,  72(SP)
	lfd	f24,  80(SP)
	lfd	f25,  88(SP)

	lfd	f26,  96(SP)
	lfd	f27, 104(SP)
	lfd	f28, 112(SP)
	lfd	f29, 120(SP)

	lfd	f30, 128(SP)
	lfd	f31, 136(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
|
|
@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
|
||||
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
|
||||
snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
|
||||
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
|
||||
dsdot_kTS,
|
||||
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
|
||||
sgemv_nTS, sgemv_tTS, sger_kTS,
|
||||
|
@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
|
||||
idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
|
||||
dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
|
||||
dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS,
|
||||
drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
|
||||
dgemv_nTS, dgemv_tTS, dger_kTS,
|
||||
dsymv_LTS, dsymv_UTS,
|
||||
|
@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
|
||||
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
|
||||
qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
|
||||
qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS,
|
||||
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
|
||||
qgemv_nTS, qgemv_tTS, qger_kTS,
|
||||
qsymv_LTS, qsymv_UTS,
|
||||
|
@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = {
|
|||
#endif
|
||||
|
||||
camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
|
||||
cnrm2_kTS, casum_kTS, ccopy_kTS,
|
||||
cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
|
||||
cdotu_kTS, cdotc_kTS, csrot_kTS,
|
||||
caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
|
||||
|
||||
|
@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = {
|
|||
#endif
|
||||
|
||||
zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
|
||||
znrm2_kTS, zasum_kTS, zcopy_kTS,
|
||||
znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
|
||||
zdotu_kTS, zdotc_kTS, zdrot_kTS,
|
||||
zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
|
||||
|
||||
|
@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = {
|
|||
XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
|
||||
|
||||
xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
|
||||
xnrm2_kTS, xasum_kTS, xcopy_kTS,
|
||||
xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
|
||||
xdotu_kTS, xdotc_kTS, xqrot_kTS,
|
||||
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
|
||||
|
||||
|
|
|
@ -0,0 +1,325 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Symbolic names used by the SPARC sum kernel that follows.
 *
 *   N, X, INCX  element count, vector base address and stride
 *   I           loop counter
 *   c1, c2      the two running partial sums (c1 also carries the result)
 *   t1..t4      staging registers: a value is moved here from a1..a8 and
 *               added to c1/c2 on a later step
 *   a1..a8      destinations of the loads from X
 *
 * The DOUBLE build uses only even-numbered %f registers; the
 * single-precision build uses consecutive ones.
 */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define I	%i3

#ifdef DOUBLE
/* double precision */
#define c1	%f0
#define c2	%f2
#define t1	%f8
#define t2	%f10
#define t3	%f12
#define t4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30
#else
/* single precision */
#define c1	%f0
#define c2	%f1
#define t1	%f4
#define t2	%f5
#define t3	%f6
#define t4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15
#endif
|
||||
|
||||
/*
 * SUM kernel for SPARC, real data.
 *
 * Adds N values read from X, INCX elements apart.  Values are moved with
 * FMOV (no absolute value) into the staging registers t1..t4 and added
 * into the two partial sums c1/c2 on a later step; the exit paths add the
 * values still staged in t1..t4 and then fold c2 into c1.
 *
 * Returns 0 without touching X when N <= 0 or INCX <= 0.  The N <= 0 test
 * is required: `and N, 7, I` is positive for most negative N, so without
 * it the tail loop would read up to seven elements.
 *
 * Every branch has its delay slot written out explicitly (a nop where no
 * useful instruction is scheduled) so that no branch depends on whatever
 * instruction happens to follow it.
 *
 * PROLOGUE, SAVESP, FCLR, FMOV, FADD, LDF, SIZE, BASE_SHIFT and EPILOGUE
 * are supplied by common.h and are not defined in this file.
 */
	PROLOGUE
	SAVESP

	/* FCLR(0) zeroes c1 (%f0); the other accumulators are copied from it. */
	FCLR(0)

	/* Scale the stride by BASE_SHIFT so it can be compared against SIZE. */
	sll	INCX, BASE_SHIFT, INCX

	FMOV	c1, c2
	FMOV	c1, t1
	FMOV	c1, t2
	FMOV	c1, t3
	FMOV	c1, t4

	/* Empty or negative length: everything is still zero, return it. */
	cmp	N, 0
	ble	.LL19
	nop

	/* Non-positive stride: likewise return zero. */
	cmp	INCX, 0
	ble	.LL19
	nop

	/* Any stride other than one element takes the .LL50 path. */
	cmp	INCX, SIZE
	bne	.LL50
	nop

	/* ---- unit stride: N >> 3 blocks of 8 elements ---- */
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Preload the first block; the counter update is interleaved. */
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	LDF	[X +  1 * SIZE], a2
	cmp	I, 0
	LDF	[X +  2 * SIZE], a3
	LDF	[X +  3 * SIZE], a4
	LDF	[X +  4 * SIZE], a5
	LDF	[X +  5 * SIZE], a6
	LDF	[X +  6 * SIZE], a7
	LDF	[X +  7 * SIZE], a8

	/* Only one block: go straight to the drain.  Delay slot advances X. */
	ble,pt	%icc, .LL12
	add	X, 8 * SIZE, X

#define PREFETCHSIZE 128

	/*
	 * Steady state.  Each step adds a staged value, stages the next
	 * loaded one and reloads its register from the following block.
	 * The condition codes set by `cmp I, 0` are consumed by the bg at
	 * the bottom; nothing in between is a cc-setting instruction.
	 */
.LL11:
	FADD	c1, t1, c1
	prefetch [X + PREFETCHSIZE * SIZE], 0
	FMOV	a1, t1
	LDF	[X +  0 * SIZE], a1

	FADD	c2, t2, c2
	add	I, -1, I
	FMOV	a2, t2
	LDF	[X +  1 * SIZE], a2

	FADD	c1, t3, c1
	cmp	I, 0
	FMOV	a3, t3
	LDF	[X +  2 * SIZE], a3

	FADD	c2, t4, c2
	nop
	FMOV	a4, t4
	LDF	[X +  3 * SIZE], a4

	FADD	c1, t1, c1
	nop
	FMOV	a5, t1
	LDF	[X +  4 * SIZE], a5

	FADD	c2, t2, c2
	nop
	FMOV	a6, t2
	LDF	[X +  5 * SIZE], a6

	FADD	c1, t3, c1
	FMOV	a7, t3
	LDF	[X +  6 * SIZE], a7
	add	X, 8 * SIZE, X

	FADD	c2, t4, c2
	FMOV	a8, t4
	bg,pt	%icc, .LL11
	/* delay slot: X was already advanced, hence the -1 offset */
	LDF	[X -  1 * SIZE], a8

	/* Drain the last loaded block; a5..a8 remain staged in t1..t4. */
.LL12:
	FADD	c1, t1, c1
	FMOV	a1, t1
	FADD	c2, t2, c2
	FMOV	a2, t2

	FADD	c1, t3, c1
	FMOV	a3, t3
	FADD	c2, t4, c2
	FMOV	a4, t4

	FADD	c1, t1, c1
	FMOV	a5, t1
	FADD	c2, t2, c2
	FMOV	a6, t2

	FADD	c1, t3, c1
	FMOV	a7, t3
	FADD	c2, t4, c2
	FMOV	a8, t4

	/* ---- unit stride: the remaining N & 7 elements ---- */
.LL15:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X +  0 * SIZE], a1
	add	I, -1, I
	cmp	I, 0
	FADD	c1, t1, c1
	FMOV	a1, t1
	bg,pt	%icc, .LL16
	add	X, 1 * SIZE, X

	/* Add whatever is still staged in t1..t4, fold c2 into c1, return. */
.LL19:
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1
	return	%i7 + 8
	clr	%g0

	/* ---- general stride: N >> 3 blocks of 8 elements ---- */
.LL50:
	sra	N, 3, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

	/* Preload the first block, stepping X by INCX after every load. */
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X
	add	I, -1, I
	LDF	[X +  0 * SIZE], a7
	cmp	I, 0
	add	X, INCX, X
	LDF	[X +  0 * SIZE], a8

	/* Only one block: go straight to the drain.  Delay slot steps X. */
	ble,pt	%icc, .LL52
	add	X, INCX, X

	/* Steady state for the strided path; mirrors .LL11. */
.LL51:
	FADD	c1, t1, c1
	add	I, -1, I
	FMOV	a1, t1
	LDF	[X +  0 * SIZE], a1
	add	X, INCX, X

	FADD	c2, t2, c2
	cmp	I, 0
	FMOV	a2, t2
	LDF	[X +  0 * SIZE], a2
	add	X, INCX, X

	FADD	c1, t3, c1
	FMOV	a3, t3
	LDF	[X +  0 * SIZE], a3
	add	X, INCX, X

	FADD	c2, t4, c2
	FMOV	a4, t4
	LDF	[X +  0 * SIZE], a4
	add	X, INCX, X

	FADD	c1, t1, c1
	FMOV	a5, t1
	LDF	[X +  0 * SIZE], a5
	add	X, INCX, X

	FADD	c2, t2, c2
	FMOV	a6, t2
	LDF	[X +  0 * SIZE], a6
	add	X, INCX, X

	FADD	c1, t3, c1
	FMOV	a7, t3
	LDF	[X +  0 * SIZE], a7
	add	X, INCX, X

	FADD	c2, t4, c2
	FMOV	a8, t4
	LDF	[X +  0 * SIZE], a8

	bg,pt	%icc, .LL51
	add	X, INCX, X

	/* Drain the last loaded block; a5..a8 remain staged in t1..t4. */
.LL52:
	FADD	c1, t1, c1
	FMOV	a1, t1
	FADD	c2, t2, c2
	FMOV	a2, t2

	FADD	c1, t3, c1
	FMOV	a3, t3
	FADD	c2, t4, c2
	FMOV	a4, t4

	FADD	c1, t1, c1
	FMOV	a5, t1
	FADD	c2, t2, c2
	FMOV	a6, t2

	FADD	c1, t3, c1
	FMOV	a7, t3
	FADD	c2, t4, c2
	FMOV	a8, t4

	/* ---- general stride: the remaining N & 7 elements ---- */
.LL55:
	and	N, 7, I
	cmp	I, 0
	ble,a,pn %icc, .LL59
	nop

.LL56:
	LDF	[X +  0 * SIZE], a1
	FADD	c1, t1, c1
	add	I, -1, I
	FMOV	a1, t1
	cmp	I, 0
	bg,pt	%icc, .LL56
	add	X, INCX, X

	/* Add whatever is still staged in t1..t4, fold c2 into c1, return. */
.LL59:
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1
	return	%i7 + 8
	/* NOTE(review): .LL19 clears %g0 here, this exit clears %o0 - left as found. */
	clr	%o0

	EPILOGUE
|
|
@ -0,0 +1,327 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N %i0
|
||||
#define X %i1
|
||||
#define INCX %i2
|
||||
#define I %i3
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define c1 %f0
|
||||
#define c2 %f2
|
||||
#define t1 %f8
|
||||
#define t2 %f10
|
||||
#define t3 %f12
|
||||
#define t4 %f14
|
||||
|
||||
#define a1 %f16
|
||||
#define a2 %f18
|
||||
#define a3 %f20
|
||||
#define a4 %f22
|
||||
#define a5 %f24
|
||||
#define a6 %f26
|
||||
#define a7 %f28
|
||||
#define a8 %f30
|
||||
#else
|
||||
#define c1 %f0
|
||||
#define c2 %f1
|
||||
#define t1 %f4
|
||||
#define t2 %f5
|
||||
#define t3 %f6
|
||||
#define t4 %f7
|
||||
|
||||
#define a1 %f8
|
||||
#define a2 %f9
|
||||
#define a3 %f10
|
||||
#define a4 %f11
|
||||
#define a5 %f12
|
||||
#define a6 %f13
|
||||
#define a7 %f14
|
||||
#define a8 %f15
|
||||
#endif
|
||||
|
||||
/*
 * Sum kernel for elements made of two consecutive values (offsets 0 and
 * 1 * SIZE; presumably real and imaginary part - ZBASE_SHIFT is used for
 * the stride).  Arguments: N = element count, X = data pointer,
 * INCX = stride in elements.  The plain sum of every loaded value is
 * accumulated in c1/c2 and finally combined into c1.
 * t1..t4 hold values that were loaded earlier and are waiting to be added.
 * PROLOGUE, SAVESP, FCLR, LDF, FADD, FMOV and EPILOGUE are macros from
 * common.h whose expansion is not visible here.
 * NOTE(review): N <= 0 is not tested in this routine - presumably the
 * caller filters it; confirm.
 */
PROLOGUE
|
||||
SAVESP
|
||||
|
||||
/* presumably clears c1 (%f0) - confirm FCLR in common.h */
FCLR(0)
|
||||
|
||||
/* scale the element stride by ZBASE_SHIFT (presumably to a byte stride) */
sll INCX, ZBASE_SHIFT, INCX
|
||||
|
||||
/* start the second accumulator and all staging registers as copies of c1 */
FMOV c1, c2
|
||||
FMOV c1, t1
|
||||
FMOV c1, t2
|
||||
FMOV c1, t3
|
||||
FMOV c1, t4
|
||||
|
||||
/* non-positive stride: go straight to the final reduction */
cmp INCX, 0
|
||||
ble .LL19
|
||||
nop
|
||||
|
||||
/* any stride other than 2 * SIZE uses the strided path at .LL50 */
cmp INCX, 2 * SIZE
|
||||
bne .LL50
|
||||
nop
|
||||
|
||||
/* I = N >> 2: the contiguous main loop handles four elements (eight values) per pass */
sra N, 2, I
|
||||
cmp I, 0
|
||||
ble,pn %icc, .LL15
|
||||
nop
|
||||
|
||||
/* preload the first eight values into a1..a8 */
LDF [X + 0 * SIZE], a1
|
||||
add I, -1, I
|
||||
LDF [X + 1 * SIZE], a2
|
||||
cmp I, 0
|
||||
LDF [X + 2 * SIZE], a3
|
||||
LDF [X + 3 * SIZE], a4
|
||||
LDF [X + 4 * SIZE], a5
|
||||
LDF [X + 5 * SIZE], a6
|
||||
LDF [X + 6 * SIZE], a7
|
||||
LDF [X + 7 * SIZE], a8
|
||||
|
||||
/* only one group of four: skip the loop and drain at .LL12 (X is advanced in the delay slot) */
ble,pt %icc, .LL12
|
||||
add X, 8 * SIZE, X
|
||||
|
||||
#define PREFETCHSIZE 32
|
||||
|
||||
/*
 * Pipelined main loop: each step adds a previously staged t-register into
 * c1 or c2, stages the matching a-register into that t-register, and
 * reloads the a-register with a value from the next group.
 */
.LL11:
|
||||
FADD c1, t1, c1
|
||||
prefetch [X + PREFETCHSIZE * SIZE], 0
|
||||
FMOV a1, t1
|
||||
LDF [X + 0 * SIZE], a1
|
||||
|
||||
FADD c2, t2, c2
|
||||
add I, -1, I
|
||||
FMOV a2, t2
|
||||
LDF [X + 1 * SIZE], a2
|
||||
|
||||
FADD c1, t3, c1
|
||||
cmp I, 0
|
||||
FMOV a3, t3
|
||||
LDF [X + 2 * SIZE], a3
|
||||
|
||||
FADD c2, t4, c2
|
||||
nop
|
||||
FMOV a4, t4
|
||||
LDF [X + 3 * SIZE], a4
|
||||
|
||||
FADD c1, t1, c1
|
||||
nop
|
||||
FMOV a5, t1
|
||||
LDF [X + 4 * SIZE], a5
|
||||
|
||||
FADD c2, t2, c2
|
||||
nop
|
||||
FMOV a6, t2
|
||||
LDF [X + 5 * SIZE], a6
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a7, t3
|
||||
LDF [X + 6 * SIZE], a7
|
||||
add X, 8 * SIZE, X
|
||||
|
||||
FADD c2, t4, c2
|
||||
FMOV a8, t4
|
||||
bg,pt %icc, .LL11
|
||||
/* delay slot: X was already advanced, so X - 1 * SIZE is the eighth value of the group */
LDF [X - 1 * SIZE], a8
|
||||
|
||||
/* drain: add/stage the last loaded group without issuing further loads */
.LL12:
|
||||
FADD c1, t1, c1
|
||||
FMOV a1, t1
|
||||
FADD c2, t2, c2
|
||||
FMOV a2, t2
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a3, t3
|
||||
FADD c2, t4, c2
|
||||
FMOV a4, t4
|
||||
|
||||
FADD c1, t1, c1
|
||||
FMOV a5, t1
|
||||
FADD c2, t2, c2
|
||||
FMOV a6, t2
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a7, t3
|
||||
FADD c2, t4, c2
|
||||
FMOV a8, t4
|
||||
|
||||
/* contiguous remainder: N & 3 elements, one element (two values) per pass */
.LL15:
|
||||
and N, 3, I
|
||||
cmp I, 0
|
||||
ble,a,pn %icc, .LL19
|
||||
nop
|
||||
|
||||
.LL16:
|
||||
LDF [X + 0 * SIZE], a1
|
||||
LDF [X + 1 * SIZE], a2
|
||||
add I, -1, I
|
||||
cmp I, 0
|
||||
FADD c1, t1, c1
|
||||
FADD c2, t2, c2
|
||||
FMOV a1, t1
|
||||
FMOV a2, t2
|
||||
bg,pt %icc, .LL16
|
||||
add X, 2 * SIZE, X
|
||||
|
||||
/* final reduction: flush the staged t1..t4, then fold c2 into c1 (the result register) */
.LL19:
|
||||
FADD c1, t1, c1
|
||||
FADD c2, t2, c2
|
||||
FADD c1, t3, c1
|
||||
FADD c2, t4, c2
|
||||
|
||||
FADD c1, c2, c1
|
||||
return %i7 + 8
|
||||
/* delay slot of the return */
clr %g0
|
||||
|
||||
/* strided path: same pipeline as above, but X advances by INCX after each element */
.LL50:
|
||||
sra N, 2, I
|
||||
cmp I, 0
|
||||
ble,pn %icc, .LL55
|
||||
nop
|
||||
|
||||
/* preload four elements (a1..a8) */
LDF [X + 0 * SIZE], a1
|
||||
LDF [X + 1 * SIZE], a2
|
||||
add X, INCX, X
|
||||
LDF [X + 0 * SIZE], a3
|
||||
LDF [X + 1 * SIZE], a4
|
||||
add X, INCX, X
|
||||
LDF [X + 0 * SIZE], a5
|
||||
LDF [X + 1 * SIZE], a6
|
||||
add X, INCX, X
|
||||
add I, -1, I
|
||||
LDF [X + 0 * SIZE], a7
|
||||
cmp I, 0
|
||||
LDF [X + 1 * SIZE], a8
|
||||
|
||||
ble,pt %icc, .LL52
|
||||
add X, INCX, X
|
||||
|
||||
.LL51:
|
||||
FADD c1, t1, c1
|
||||
add I, -1, I
|
||||
FMOV a1, t1
|
||||
LDF [X + 0 * SIZE], a1
|
||||
|
||||
FADD c2, t2, c2
|
||||
cmp I, 0
|
||||
FMOV a2, t2
|
||||
LDF [X + 1 * SIZE], a2
|
||||
add X, INCX, X
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a3, t3
|
||||
LDF [X + 0 * SIZE], a3
|
||||
|
||||
FADD c2, t4, c2
|
||||
FMOV a4, t4
|
||||
LDF [X + 1 * SIZE], a4
|
||||
add X, INCX, X
|
||||
|
||||
FADD c1, t1, c1
|
||||
FMOV a5, t1
|
||||
LDF [X + 0 * SIZE], a5
|
||||
|
||||
FADD c2, t2, c2
|
||||
FMOV a6, t2
|
||||
LDF [X + 1 * SIZE], a6
|
||||
add X, INCX, X
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a7, t3
|
||||
LDF [X + 0 * SIZE], a7
|
||||
|
||||
FADD c2, t4, c2
|
||||
FMOV a8, t4
|
||||
LDF [X + 1 * SIZE], a8
|
||||
|
||||
bg,pt %icc, .LL51
|
||||
add X, INCX, X
|
||||
|
||||
/* drain of the strided loop */
.LL52:
|
||||
FADD c1, t1, c1
|
||||
FMOV a1, t1
|
||||
FADD c2, t2, c2
|
||||
FMOV a2, t2
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a3, t3
|
||||
FADD c2, t4, c2
|
||||
FMOV a4, t4
|
||||
|
||||
FADD c1, t1, c1
|
||||
FMOV a5, t1
|
||||
FADD c2, t2, c2
|
||||
FMOV a6, t2
|
||||
|
||||
FADD c1, t3, c1
|
||||
FMOV a7, t3
|
||||
FADD c2, t4, c2
|
||||
FMOV a8, t4
|
||||
|
||||
/* strided remainder: N & 3 elements */
.LL55:
|
||||
and N, 3, I
|
||||
cmp I, 0
|
||||
ble,a,pn %icc, .LL59
|
||||
nop
|
||||
|
||||
.LL56:
|
||||
LDF [X + 0 * SIZE], a1
|
||||
LDF [X + 1 * SIZE], a2
|
||||
FADD c1, t1, c1
|
||||
FADD c2, t2, c2
|
||||
add I, -1, I
|
||||
FMOV a1, t1
|
||||
FMOV a2, t2
|
||||
cmp I, 0
|
||||
bg,pt %icc, .LL56
|
||||
add X, INCX, X
|
||||
|
||||
/* final reduction for the strided path (same sequence as .LL19) */
.LL59:
|
||||
FADD c1, t1, c1
|
||||
FADD c2, t2, c2
|
||||
FADD c1, t3, c1
|
||||
FADD c2, t4, c2
|
||||
|
||||
FADD c1, c2, c1
|
||||
|
||||
return %i7 + 8
|
||||
/* delay slot of the return */
clr %o0
|
||||
|
||||
EPILOGUE
|
|
@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -0,0 +1,207 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define STACK 8
|
||||
#define ARGS 0
|
||||
|
||||
#define STACK_M 4 + STACK + ARGS(%esp)
|
||||
#define STACK_X 8 + STACK + ARGS(%esp)
|
||||
#define STACK_INCX 12 + STACK + ARGS(%esp)
|
||||
|
||||
#define M %edx
|
||||
#define X %ecx
|
||||
#define INCX %esi
|
||||
|
||||
#define I %eax
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/*
 * i386 x87 sum kernel: adds M values read from X with stride INCX and
 * leaves the result on top of the x87 stack.  Four x87 stack slots are
 * used as independent partial sums.  PROLOGUE, PROFCODE, EMMS, FLD,
 * ALIGN_4, PREFETCH and EPILOGUE are macros defined outside this routine
 * (common.h / l1param.h); FLD is presumably the precision-specific x87 load.
 */
PROLOGUE
|
||||
|
||||
/* two 4-byte pushes: this is what STACK (8) compensates for in STACK_M/STACK_X/STACK_INCX */
/* %ebx is saved and restored but not referenced by the instructions visible here */
pushl %esi
|
||||
pushl %ebx
|
||||
|
||||
PROFCODE
|
||||
|
||||
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
|
||||
EMMS
|
||||
#endif
|
||||
|
||||
/* fetch the three arguments from the stack */
movl STACK_M, M
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
|
||||
/* with F_INTERFACE the M and INCX arguments are pointers: dereference them */
#ifdef F_INTERFACE
|
||||
movl (M), M
|
||||
movl (INCX), INCX
|
||||
#endif
|
||||
|
||||
/* push the default result 0; return it unchanged when M <= 0 or INCX <= 0 */
fldz
|
||||
testl M, M
|
||||
jle .L999
|
||||
testl INCX, INCX
|
||||
jle .L999
|
||||
|
||||
/* scale the stride by BASE_SHIFT (presumably elements -> bytes), then push three more zero accumulators */
sall $BASE_SHIFT, INCX
|
||||
fldz
|
||||
fldz
|
||||
fldz
|
||||
/* stride different from one SIZE: use the strided path at .L40 */
cmpl $SIZE, INCX
|
||||
jne .L40
|
||||
|
||||
/* I = M >> 3: contiguous main loop, eight values per pass */
movl M, I
|
||||
sarl $3, I
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
.L10:
|
||||
#ifdef PREFETCH
|
||||
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||
#endif
|
||||
|
||||
/* four loads put the accumulators at st(4)..st(7) */
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
FLD 2 * SIZE(X)
|
||||
FLD 3 * SIZE(X)
|
||||
|
||||
/* each faddp adds the top value into a different accumulator and pops it */
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 4 * SIZE(X)
|
||||
FLD 5 * SIZE(X)
|
||||
FLD 6 * SIZE(X)
|
||||
FLD 7 * SIZE(X)
|
||||
|
||||
addl $8 * SIZE, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decl I
|
||||
jg .L10
|
||||
ALIGN_4
|
||||
|
||||
/* contiguous remainder: M & 7 values, one per pass, all added to the top accumulator */
.L20:
|
||||
movl M, I
|
||||
andl $7, I
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L21:
|
||||
FLD (X)
|
||||
faddp %st,%st(1)
|
||||
addl $1 * SIZE, X
|
||||
decl I
|
||||
jg .L21
|
||||
jmp .L998
|
||||
ALIGN_4
|
||||
|
||||
/* strided path: same accumulation scheme, X advances by INCX after every load */
.L40:
|
||||
movl M, I
|
||||
sarl $3, I
|
||||
jle .L60
|
||||
ALIGN_4
|
||||
|
||||
.L50:
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decl I
|
||||
jg .L50
|
||||
ALIGN_4
|
||||
|
||||
/* strided remainder: M & 7 values */
.L60:
|
||||
movl M, I
|
||||
andl $7, I
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L61:
|
||||
FLD (X)
|
||||
addl INCX, X
|
||||
faddp %st,%st(1)
|
||||
decl I
|
||||
jg .L61
|
||||
ALIGN_4
|
||||
|
||||
/* collapse the four partial sums into a single value in st(0) */
.L998:
|
||||
faddp %st,%st(2)
|
||||
faddp %st,%st(1)
|
||||
faddp %st,%st(1)
|
||||
ALIGN_4
|
||||
|
||||
/* common exit: restore the saved registers in reverse push order */
.L999:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,208 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define STACK 8
|
||||
#define ARGS 0
|
||||
|
||||
#define STACK_M 4 + STACK + ARGS(%esp)
|
||||
#define STACK_X 8 + STACK + ARGS(%esp)
|
||||
#define STACK_INCX 12 + STACK + ARGS(%esp)
|
||||
|
||||
#define M %edx
|
||||
#define X %ecx
|
||||
#define INCX %esi
|
||||
|
||||
#define I %eax
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/*
 * i386 x87 sum kernel for elements made of two consecutive values
 * (offsets 0 and 1 * SIZE; presumably real and imaginary part).  Adds
 * every value of M elements read from X with stride INCX and leaves the
 * result on top of the x87 stack.  Four x87 stack slots are used as
 * independent partial sums.  PROLOGUE, PROFCODE, EMMS, FLD, ALIGN_4,
 * PREFETCH and EPILOGUE are macros defined outside this routine.
 */
PROLOGUE
|
||||
|
||||
/* two 4-byte pushes: this is what STACK (8) compensates for in the STACK_* argument offsets */
pushl %esi
|
||||
pushl %ebx
|
||||
|
||||
PROFCODE
|
||||
|
||||
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
|
||||
EMMS
|
||||
#endif
|
||||
|
||||
/* fetch the three arguments from the stack */
movl STACK_M, M
|
||||
movl STACK_X, X
|
||||
movl STACK_INCX, INCX
|
||||
|
||||
/* with F_INTERFACE the M and INCX arguments are pointers: dereference them */
#ifdef F_INTERFACE
|
||||
movl (M), M
|
||||
movl (INCX), INCX
|
||||
#endif
|
||||
|
||||
/* push the default result 0; return it unchanged when M <= 0 or INCX <= 0 */
fldz
|
||||
testl M, M
|
||||
jle .L999
|
||||
testl INCX, INCX
|
||||
jle .L999
|
||||
|
||||
/* scale the stride by ZBASE_SHIFT (presumably elements -> bytes) */
sall $ZBASE_SHIFT, INCX
|
||||
|
||||
/* three more zero accumulators: four partial sums in total */
fldz
|
||||
fldz
|
||||
fldz
|
||||
/* stride different from 2 * SIZE: use the strided path at .L40 */
cmpl $SIZE * 2, INCX
|
||||
jne .L40
|
||||
|
||||
/* I = M >> 2: contiguous main loop, four elements (eight values) per pass */
movl M, I
|
||||
sarl $2, I
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
.L10:
|
||||
#ifdef PREFETCH
|
||||
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||
#endif
|
||||
|
||||
/* four loads put the accumulators at st(4)..st(7) */
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
FLD 2 * SIZE(X)
|
||||
FLD 3 * SIZE(X)
|
||||
|
||||
/* each faddp adds the top value into a different accumulator and pops it */
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 4 * SIZE(X)
|
||||
FLD 5 * SIZE(X)
|
||||
FLD 6 * SIZE(X)
|
||||
FLD 7 * SIZE(X)
|
||||
|
||||
addl $8 * SIZE, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decl I
|
||||
jg .L10
|
||||
ALIGN_4
|
||||
|
||||
/* contiguous remainder: M & 3 elements, the two values go to two different accumulators */
.L20:
|
||||
movl M, I
|
||||
andl $3, I
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L21:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
faddp %st,%st(3)
|
||||
faddp %st,%st(1)
|
||||
addl $2 * SIZE, X
|
||||
decl I
|
||||
jg .L21
|
||||
jmp .L998
|
||||
ALIGN_4
|
||||
|
||||
/* strided path: two elements are loaded before each group of four faddp; X advances by INCX per element */
.L40:
|
||||
movl M, I
|
||||
sarl $2, I
|
||||
jle .L60
|
||||
ALIGN_4
|
||||
|
||||
.L50:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addl INCX, X
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addl INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addl INCX, X
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addl INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decl I
|
||||
jg .L50
|
||||
ALIGN_4
|
||||
|
||||
/* strided remainder: M & 3 elements */
.L60:
|
||||
movl M, I
|
||||
andl $3, I
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L61:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addl INCX, X
|
||||
faddp %st,%st(3)
|
||||
faddp %st,%st(1)
|
||||
decl I
|
||||
jg .L61
|
||||
ALIGN_4
|
||||
|
||||
/* collapse the four partial sums into a single value in st(0) */
.L998:
|
||||
faddp %st,%st(2)
|
||||
faddp %st,%st(1)
|
||||
faddp %st,%st(1)
|
||||
ALIGN_4
|
||||
|
||||
/* common exit: restore the saved registers in reverse push order */
.L999:
|
||||
popl %ebx
|
||||
popl %esi
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M ARG1
|
||||
#define X ARG2
|
||||
#define INCX ARG3
|
||||
|
||||
#define I %rax
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/*
 * x86_64 x87 sum kernel: adds M values read from X with stride INCX and
 * leaves the result on top of the x87 stack.  Four x87 stack slots are
 * used as independent partial sums.  M, X and INCX alias ARG1..ARG3;
 * PROLOGUE, PROFCODE, FLD, ALIGN_4, PREFETCH and EPILOGUE are macros
 * defined outside this routine (FLD is presumably the x87 load for the
 * configured precision).
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* push the default result 0; return it unchanged when M <= 0 or INCX <= 0 */
fldz
|
||||
testq M, M
|
||||
jle .L999
|
||||
testq INCX, INCX
|
||||
jle .L999
|
||||
|
||||
/* scale the stride by BASE_SHIFT (presumably elements -> bytes) */
salq $BASE_SHIFT, INCX
|
||||
|
||||
/* three more zero accumulators: four partial sums in total */
fldz
|
||||
fldz
|
||||
fldz
|
||||
/* stride different from one SIZE: use the strided path at .L40 */
cmpq $SIZE, INCX
|
||||
jne .L40
|
||||
|
||||
/* I = M >> 3: contiguous main loop, eight values per pass */
movq M, I
|
||||
sarq $3, I
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
.L10:
|
||||
#ifdef PREFETCH
|
||||
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||
#endif
|
||||
|
||||
/* four loads put the accumulators at st(4)..st(7) */
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
FLD 2 * SIZE(X)
|
||||
FLD 3 * SIZE(X)
|
||||
|
||||
/* each faddp adds the top value into a different accumulator and pops it */
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 4 * SIZE(X)
|
||||
FLD 5 * SIZE(X)
|
||||
FLD 6 * SIZE(X)
|
||||
FLD 7 * SIZE(X)
|
||||
|
||||
addq $8 * SIZE, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decq I
|
||||
jg .L10
|
||||
ALIGN_4
|
||||
|
||||
/* contiguous remainder: M is overwritten with M & 7 and used as the countdown */
.L20:
|
||||
andq $7, M
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
.L21:
|
||||
FLD (X)
|
||||
faddp %st,%st(1)
|
||||
addq $1 * SIZE, X
|
||||
decq M
|
||||
jg .L21
|
||||
jmp .L998
|
||||
ALIGN_4
|
||||
|
||||
/* strided path: same accumulation scheme, X advances by INCX after every load */
.L40:
|
||||
movq M, I
|
||||
sarq $3, I
|
||||
jle .L60
|
||||
ALIGN_4
|
||||
|
||||
.L50:
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decq I
|
||||
jg .L50
|
||||
ALIGN_4
|
||||
|
||||
/* strided remainder: M & 7 values */
.L60:
|
||||
andq $7, M
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L61:
|
||||
FLD (X)
|
||||
addq INCX, X
|
||||
faddp %st,%st(1)
|
||||
decq M
|
||||
jg .L61
|
||||
ALIGN_4
|
||||
|
||||
/* collapse the four partial sums into a single value in st(0) */
.L998:
|
||||
faddp %st,%st(2)
|
||||
faddp %st,%st(1)
|
||||
faddp %st,%st(1)
|
||||
ALIGN_4
|
||||
|
||||
/* common exit; on the early-exit path only the initial zero is on the x87 stack */
.L999:
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,180 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M ARG1
|
||||
#define X ARG2
|
||||
#define INCX ARG3
|
||||
|
||||
#define I %rax
|
||||
|
||||
#include "l1param.h"
|
||||
|
||||
/*
 * x86_64 x87 sum kernel for elements made of two consecutive values
 * (offsets 0 and 1 * SIZE; presumably real and imaginary part).  Adds
 * every value of M elements read from X with stride INCX and leaves the
 * result on top of the x87 stack.  Four x87 stack slots are used as
 * independent partial sums.  M, X and INCX alias ARG1..ARG3; PROLOGUE,
 * PROFCODE, FLD, ALIGN_4, PREFETCH and EPILOGUE are macros defined
 * outside this routine.
 */
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
/* push the default result 0; return it unchanged when M <= 0 or INCX <= 0 */
fldz
|
||||
testq M, M
|
||||
jle .L999
|
||||
testq INCX, INCX
|
||||
jle .L999
|
||||
|
||||
/* scale the stride by ZBASE_SHIFT (presumably elements -> bytes) */
salq $ZBASE_SHIFT, INCX
|
||||
|
||||
/* three more zero accumulators: four partial sums in total */
fldz
|
||||
fldz
|
||||
fldz
|
||||
/* stride different from 2 * SIZE: use the strided path at .L40 */
cmpq $SIZE * 2, INCX
|
||||
jne .L40
|
||||
|
||||
/* I = M >> 2: contiguous main loop, four elements (eight values) per pass */
movq M, I
|
||||
sarq $2, I
|
||||
jle .L20
|
||||
ALIGN_4
|
||||
|
||||
.L10:
|
||||
#ifdef PREFETCH
|
||||
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||
#endif
|
||||
|
||||
/* four loads put the accumulators at st(4)..st(7) */
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
FLD 2 * SIZE(X)
|
||||
FLD 3 * SIZE(X)
|
||||
|
||||
/* each faddp adds the top value into a different accumulator and pops it */
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 4 * SIZE(X)
|
||||
FLD 5 * SIZE(X)
|
||||
FLD 6 * SIZE(X)
|
||||
FLD 7 * SIZE(X)
|
||||
|
||||
addq $8 * SIZE, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decq I
|
||||
jg .L10
|
||||
ALIGN_4
|
||||
|
||||
/* contiguous remainder: M is overwritten with M & 3 and used as the countdown */
.L20:
|
||||
andq $3, M
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L21:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
faddp %st,%st(3)
|
||||
faddp %st,%st(1)
|
||||
addq $2 * SIZE, X
|
||||
decq M
|
||||
jg .L21
|
||||
jmp .L998
|
||||
ALIGN_4
|
||||
|
||||
/* strided path: two elements are loaded before each group of four faddp; X advances by INCX per element */
.L40:
|
||||
movq M, I
|
||||
sarq $2, I
|
||||
jle .L60
|
||||
ALIGN_4
|
||||
|
||||
.L50:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addq INCX, X
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addq INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addq INCX, X
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addq INCX, X
|
||||
|
||||
faddp %st, %st(7)
|
||||
faddp %st, %st(5)
|
||||
faddp %st, %st(3)
|
||||
faddp %st, %st(1)
|
||||
|
||||
decq I
|
||||
jg .L50
|
||||
ALIGN_4
|
||||
|
||||
/* strided remainder: M & 3 elements */
.L60:
|
||||
andq $3, M
|
||||
jle .L998
|
||||
ALIGN_4
|
||||
|
||||
|
||||
.L61:
|
||||
FLD 0 * SIZE(X)
|
||||
FLD 1 * SIZE(X)
|
||||
addq INCX, X
|
||||
faddp %st,%st(3)
|
||||
faddp %st,%st(1)
|
||||
decq M
|
||||
jg .L61
|
||||
ALIGN_4
|
||||
|
||||
/* collapse the four partial sums into a single value in st(0) */
.L998:
|
||||
faddp %st,%st(2)
|
||||
faddp %st,%st(1)
|
||||
faddp %st,%st(1)
|
||||
ALIGN_4
|
||||
|
||||
/* common exit; on the early-exit path only the initial zero is on the x87 stack */
.L999:
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
||||
# ?SUM kernels compute the plain sum, so they must not reuse the ?ASUM
# (sum of absolute values) sources listed above.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
|
|||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
||||
SSUMKERNEL = ssum.c
|
||||
DSUMKERNEL = dsum.c
|
||||
CSUMKERNEL = csum.c
|
||||
ZSUMKERNEL = zsum.c
|
||||
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
|
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
|
|||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
|
|
@ -0,0 +1,137 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/*
 * Vector kernel for the unit-stride case of the complex single-precision sum.
 *
 * n  number of complex elements to process.  The count is shifted right by
 *    5 and used as a BRCTG loop counter, so it must be a positive multiple
 *    of 32 (the caller passes n & -32 and only calls when that is > 0).
 * x  contiguous input; 2 * n FLOAT values are read.
 *
 * Each loop pass loads 256 bytes (16 vector registers of 16 bytes) and adds
 * them lane-wise into the eight accumulators v24..v31.  After the loop the
 * accumulators are folded into v24, its four lanes are reduced to lane 0,
 * and that lane is stored to the local result.
 *
 * Returns the sum of all 2 * n FLOAT values.
 */
static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* horizontal reduction of the four lanes into lane 0 */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* the operand name must match the [sum] output declared below */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||
|
||||
/*
 * Complex sum entry point: returns the sum of the two FLOAT components of
 * each of the n elements of x, taken with element stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple of 32 elements is handed to csum_kernel_32 and the rest is
 * added in a scalar loop; any other stride is handled entirely in scalar
 * code.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* elements consumed so far */
  BLASLONG pos = 0;  /* index of the current element's first FLOAT */
  BLASLONG step;
  BLASLONG bulk;

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x != 1) {
    /* strided input: each element occupies two FLOATs, so step by 2 * inc_x */
    step = 2 * inc_x;
    for (; done < n; done++, pos += step)
      total += x[pos] + x[pos + 1];
    return total;
  }

  /* contiguous input: vector kernel on the multiple-of-32 prefix */
  bulk = n & -32;
  if (bulk > 0) {
    total = csum_kernel_32(bulk, x);
    done = bulk;
    pos = 2 * bulk;
  }

  /* scalar tail */
  for (; done < n; done++, pos += 2)
    total += x[pos] + x[pos + 1];

  return total;
}
|
|
@ -0,0 +1,148 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/*
 * Vectorised sum of n contiguous FLOAT values starting at x (vfadb works on
 * double-precision lanes, two per 16-byte vector register).
 *
 * The loop runs n >> 5 times and each pass consumes 256 bytes (32 values),
 * so n is expected to be a positive multiple of 32; the driver in this file
 * passes (n & -32) and only calls when that is > 0.  A zero count must not
 * be passed: brctg decrements the counter before testing it.
 *
 * Returns the total, taken from element 0 of v24 after the lane reduction.
 */
static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* clear the eight vector accumulators v24..v31 */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n := number of 32-element passes; r1 := byte offset 0 */
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* prefetch 1024 bytes ahead of the current offset */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes of the pass */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes of the pass */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* advance 256 bytes and loop while passes remain */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* add lane 1 onto lane 0, then store lane 0 */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* operand name must match the [sum] output declared below */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          /* the unnamed "m" input tells the compiler x[0..n) is read */
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||
|
||||
/*
 * Plain (non-absolute) sum of n FLOAT values from x, taking every inc_x-th
 * entry.  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the largest multiple-of-32 prefix goes through
 * dsum_kernel_32(), the rest is added element by element.
 * Other strides: four elements per pass alternate between two partial sums,
 * which are combined before the leftover elements are added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG idx = 0; /* index into x */
  BLASLONG cnt = 0; /* elements consumed on the strided path */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x == 1) {
    const BLASLONG bulk = n & -32;

    if (bulk > 0) {
      total = dsum_kernel_32(bulk, x);
      idx = bulk;
    }

    for (; idx < n; idx++)
      total += x[idx];

    return total;
  }

  {
    const BLASLONG quads = n & -4;
    FLOAT even = 0.0;
    FLOAT odd = 0.0;

    for (; cnt < quads; cnt += 4, idx += inc_x * 4) {
      even += x[idx];
      odd += x[idx + inc_x];
      even += x[idx + 2 * inc_x];
      odd += x[idx + 3 * inc_x];
    }

    total = even + odd;

    for (; cnt < n; cnt++, idx += inc_x)
      total += x[idx];
  }

  return total;
}
|
|
@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
|
||||
/*
 * Vectorised sum of n contiguous FLOAT values starting at x (vfasb works on
 * single-precision lanes, four per 16-byte vector register).
 *
 * The loop runs n >> 6 times and each pass consumes 256 bytes (64 values),
 * so n is expected to be a positive multiple of 64; the driver in this file
 * passes (n & -64) and only calls when that is > 0.  A zero count must not
 * be passed: brctg decrements the counter before testing it.
 *
 * Returns the total, taken from element 0 of v24 after the lane reduction.
 */
static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* clear the eight vector accumulators v24..v31 */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n := number of 64-element passes; r1 := byte offset 0 */
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* prefetch 1024 bytes ahead of the current offset */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes of the pass */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes of the pass */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* advance 256 bytes and loop while passes remain */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* shift each doubleword left 32 bits so words 1 and 3 land on
             words 0 and 2, then add: words 0 and 2 hold pair sums */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* broadcast word 2 and add so word 0 holds the full total */
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* operand name must match the [sum] output declared below */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          /* the unnamed "m" input tells the compiler x[0..n) is read */
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||
|
||||
/*
 * Plain (non-absolute) sum of n FLOAT values from x, taking every inc_x-th
 * entry.  Returns 0.0 when n <= 0 or inc_x <= 0.
 *
 * Unit stride: the largest multiple-of-64 prefix goes through
 * ssum_kernel_64(), the rest is added element by element.
 * Other strides: four elements per pass alternate between two partial sums,
 * which are combined before the leftover elements are added.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG idx = 0; /* index into x */
  BLASLONG cnt = 0; /* elements consumed on the strided path */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x == 1) {
    const BLASLONG bulk = n & -64;

    if (bulk > 0) {
      total = ssum_kernel_64(bulk, x);
      idx = bulk;
    }

    for (; idx < n; idx++)
      total += x[idx];

    return total;
  }

  {
    const BLASLONG quads = n & -4;
    FLOAT even = 0.0;
    FLOAT odd = 0.0;

    for (; cnt < quads; cnt += 4, idx += inc_x * 4) {
      even += x[idx];
      odd += x[idx + inc_x];
      even += x[idx + 2 * inc_x];
      odd += x[idx + 3 * inc_x];
    }

    total = even + odd;

    for (; cnt < n; cnt++, idx += inc_x)
      total += x[idx];
  }

  return total;
}
|
|
@ -0,0 +1,136 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
|
||||
/*
 * Vectorised sum over n contiguous complex elements starting at x, stored
 * as interleaved FLOAT pairs (2 * n FLOATs are read).  Real and imaginary
 * lanes are added together at the end, so the result is the sum of every
 * component.  vfadb works on double-precision lanes.
 *
 * The loop runs n >> 4 times and each pass consumes 256 bytes (16 complex
 * elements), so n is expected to be a positive multiple of 16; the driver
 * in this file passes (n & -16) and only calls when that is > 0.  A zero
 * count must not be passed: brctg decrements the counter before testing it.
 *
 * Returns the total, taken from element 0 of v24 after the lane reduction.
 */
static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* clear the eight vector accumulators v24..v31 */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n := number of 16-element passes; r1 := byte offset 0 */
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* prefetch 1024 bytes ahead of the current offset */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes of the pass */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes of the pass */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* advance 256 bytes and loop while passes remain */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* fold the eight accumulators into v24 */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* add lane 1 onto lane 0, then store lane 0 */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* operand name must match the [sum] output declared below */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          /* the unnamed "m" input tells the compiler x[0..2n) is read */
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||
|
||||
/*
 * Sum of all real and imaginary parts of n complex elements stored as
 * interleaved FLOAT pairs in x, visiting every inc_x-th element.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple-of-16 prefix is handed to zsum_kernel_16(); whatever remains
 * (and every element of a strided vector) is accumulated one at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* complex elements already accumulated */
  BLASLONG pos = 0;  /* index of the real part of the next element */
  BLASLONG step;

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x == 1) {
    BLASLONG bulk = n & -16;

    if (bulk > 0) {
      total = zsum_kernel_16(bulk, x);
      done = bulk;
      pos = 2 * bulk;
    }
  }

  /* Two FLOATs per complex element, so the index advances by 2 * inc_x. */
  step = 2 * inc_x;
  for (; done < n; done++, pos += step)
    total += x[pos] + x[pos + 1];

  return total;
}
|
Loading…
Reference in New Issue