Merge pull request #2072 from martin-frbg/sum

Add (C)BLAS extension ?sum
This commit is contained in:
Martin Kroeker 2019-04-23 20:11:36 +02:00 committed by GitHub
commit ccfb7ead15
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
52 changed files with 5640 additions and 27 deletions

View File

@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
/* ?sum extension entry points.  They take the same argument list as the
 * ?asum prototypes directly above: element count n, vector x, and
 * increment incx.  The sc/dz variants receive the vector as void* and still
 * return a single real value of the matching precision. */
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);

View File

@ -107,6 +107,12 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro ()
macro(SetDefaultL2)

View File

@ -19,6 +19,7 @@
#define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k
#define CROT_K csrot_k
@ -249,6 +250,7 @@
#define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k

View File

@ -19,6 +19,7 @@
#define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k
#define DROT_K drot_k
@ -174,6 +175,7 @@
#define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k

View File

@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);

View File

@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);

View File

@ -66,6 +66,7 @@
#define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K
#define ROT_K QROT_K
@ -356,6 +357,7 @@
#define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K
#define ROT_K DROT_K
@ -658,6 +660,7 @@
#define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K
#define ROT_K SROT_K
@ -962,6 +965,7 @@
#define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K
#define ROT_K XROT_K
@ -1363,6 +1367,7 @@
#define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K
#define ROT_K ZROT_K
@ -1785,6 +1790,7 @@
#define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K
#define ROT_K CROT_K

View File

@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);

View File

@ -19,6 +19,7 @@
#define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k
#define QROT_K qrot_k
@ -161,6 +162,7 @@
#define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k

View File

@ -12,6 +12,7 @@
#define ISMAX_K ismax_k
#define ISMIN_K ismin_k
#define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k
@ -170,6 +171,7 @@
#define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k

View File

@ -19,6 +19,7 @@
#define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k
#define XROT_K xqrot_k
@ -227,6 +228,7 @@
#define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k

View File

@ -19,6 +19,7 @@
#define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k
#define ZROT_K zdrot_k
@ -249,6 +250,7 @@
#define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k

View File

@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
rotm.c rotmg.c # N.B. these do not have complex counterparts
rot.c
asum.c
sum.c
)
# these will have 'z' prepended for the complex version
@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
endif ()
if (${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
endif ()
endforeach ()

View File

@ -25,7 +25,7 @@ SBLAS1OBJS = \
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
scopy.$(SUFFIX) sscal.$(SUFFIX) \
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
@ -51,7 +51,7 @@ DBLAS1OBJS = \
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
ddot.$(SUFFIX) \
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
@ -76,7 +76,7 @@ CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
scamax.$(SUFFIX) icamax.$(SUFFIX) \
scamin.$(SUFFIX) icamin.$(SUFFIX) \
csrot.$(SUFFIX) crotg.$(SUFFIX) \
@ -105,7 +105,7 @@ ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
@ -146,7 +146,7 @@ QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qdot.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@ -168,7 +168,7 @@ XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
QBLAS1OBJS = \
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
@ -224,7 +224,7 @@ QBLAS3OBJS = \
XBLAS1OBJS = \
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
@ -264,7 +264,7 @@ CSBLAS1OBJS = \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX)
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@ -282,7 +282,7 @@ CDBLAS1OBJS = \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX)
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@ -303,7 +303,7 @@ CCBLAS1OBJS = \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX)
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@ -330,7 +330,7 @@ CZBLAS1OBJS = \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX)
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
CZBLAS2OBJS = \
@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -c $< -o $(@F)
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c $< -o $(@F)
@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

97
interface/sum.c Normal file
View File

@ -0,0 +1,97 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
/*
 * Pointer-argument entry point, compiled only when CBLAS is not defined.
 *
 * NAME, FLOAT, FLOATRET, COMPSIZE and SUM_K are macros defined outside this
 * file (presumably via common.h and the build flags), which lets the same
 * source be compiled once per precision.
 *
 *   N    - pointer to the element count
 *   x    - pointer to the input vector
 *   INCX - pointer to the increment passed on to the kernel
 *
 * Returns whatever SUM_K produces, cast to FLOATRET.  When *N <= 0 the
 * kernel is not called and 0 is returned.
 */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
/* Dereference once and convert to the BLASLONG type the kernel call takes. */
BLASLONG n = *N;
BLASLONG incx = *INCX;
FLOATRET ret;
PRINT_DEBUG_NAME;
/* Nothing to add up: skip the kernel and the instrumentation below. */
if (n <= 0) return 0;
/* IDEBUG_* and FUNCTION_PROFILE_* are project instrumentation macros whose
 * expansion is not visible here; keep each START/END pair around the kernel
 * call and in this order. */
IDEBUG_START;
FUNCTION_PROFILE_START();
ret = (FLOATRET)SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#else
/*
 * By-value entry point, compiled when CBLAS is defined.
 *
 * The function header depends on COMPLEX: in the COMPLEX build the vector
 * arrives as void* (vx) and is reinterpreted as FLOAT*; otherwise it is
 * taken directly as FLOAT*.  Everything after the #endif is shared.
 *
 *   n    - element count
 *   x/vx - input vector
 *   incx - increment passed on to the kernel
 *
 * Returns the value produced by SUM_K as FLOAT.  When n <= 0 the kernel is
 * not called and 0 is returned.
 */
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif
FLOAT ret;
PRINT_DEBUG_CNAME;
/* Nothing to add up: skip the kernel and the instrumentation below. */
if (n <= 0) return 0;
/* IDEBUG_* and FUNCTION_PROFILE_* are project instrumentation macros whose
 * expansion is not visible here; keep each START/END pair around the kernel
 * call and in this order. */
IDEBUG_START;
FUNCTION_PROFILE_START();
ret = SUM_K(n, x, incx);
FUNCTION_PROFILE_END(COMPSIZE, n, n);
IDEBUG_END;
return ret;
}
#endif

View File

@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})

View File

@ -340,6 +340,32 @@ ifndef XSCALKERNEL
XSCALKERNEL = zscal.S
endif
### SUM ###
# Fallback kernel sources for the ?sum kernels.  Each variable is assigned
# only when nothing has defined it earlier (ifndef), so a value set before
# this point always wins.  The S, D and Q variants fall back to sum.S and
# the C, Z and X variants to zsum.S.
ifndef SSUMKERNEL
SSUMKERNEL = sum.S
endif
ifndef DSUMKERNEL
DSUMKERNEL = sum.S
endif
ifndef CSUMKERNEL
CSUMKERNEL = zsum.S
endif
ifndef ZSUMKERNEL
ZSUMKERNEL = zsum.S
endif
ifndef QSUMKERNEL
QSUMKERNEL = sum.S
endif
ifndef XSUMKERNEL
XSUMKERNEL = zsum.S
endif
### SWAP ###
ifndef SSWAPKERNEL
@ -453,7 +479,7 @@ endif
SBLASOBJS += \
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
saxpby_k$(TSUFFIX).$(SUFFIX)
@ -463,31 +489,32 @@ DBLASOBJS += \
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
daxpby_k$(TSUFFIX).$(SUFFIX)
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
QBLASOBJS += \
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
qsum_k$(TSUFFIX).$(SUFFIX)
CBLASOBJS += \
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += \
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)
XBLASOBJS += \
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
### AMAX ###
@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
### ASUM ###
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
### SUM ###
# One rule per precision prefix (s, d, q, c, z, x).  Every rule compiles the
# source named by the matching ?SUMKERNEL variable into both the regular and
# the profiled object.  The -D/-U flags are the only difference between the
# rules: COMPLEX is defined for c/z/x and undefined for s/d/q, while
# DOUBLE / XDOUBLE choose the element precision (-UDOUBLE for s and c,
# -DDOUBLE for d and z, -DXDOUBLE for q and x).
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
### AXPY ###
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

206
kernel/alpha/sum.S Normal file
View File

@ -0,0 +1,206 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
/*
 * Sum kernel: adds up N values read from X and leaves the total in s0 at
 * the final ret.  Loaded values are only copied (fmov) before being added;
 * no absolute value or scaling is applied.
 *
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE are macros from the
 * project headers and are not visible here.  SXADDQ INCX, X, X is used after
 * every load to step X to the next element (presumably X += INCX * SIZE).
 *
 * The code is software pipelined: a value loaded into aK is first copied to
 * a staging register tJ and only added to the partial sum sJ by a later ADD.
 * Because of that, one staged value is always still pending when a section
 * ends, and the next section (or the exit path) adds it.
 */
/* Offset used by the discarded load in the main loop. */
#define PREFETCHSIZE 88
/* Integer registers.  N, X and INCX are read without being set here, so they
 * hold the incoming arguments; I is the loop counter. */
#define N $16
#define X $17
#define INCX $18
#define I $19
/* s0-s3: four independent partial sums; s0 also carries the final total. */
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11
/* a0-a7: destinations of the vector loads. */
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19
/* t0-t3: staged copies of loaded values awaiting their ADD. */
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23
PROLOGUE
PROFCODE
/* s0 and t0 are cleared first so that the exit path (s0 += t0) yields 0
 * when N <= 0. */
fclr s0
unop
fclr t0
ble N, $L999
/* I = N >> 3: number of 8-element groups.  With no full group, go straight
 * to the remainder handling at $L15. */
sra N, 3, I
fclr s1
fclr s2
ble I, $L15
/* Preload the first six values of the first group, clearing the remaining
 * accumulators and staging registers in between. */
LD a0, 0 * SIZE(X)
fclr t1
SXADDQ INCX, X, X
fclr t2
LD a1, 0 * SIZE(X)
fclr t3
SXADDQ INCX, X, X
fclr s3
LD a2, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a3, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a4, 0 * SIZE(X)
SXADDQ INCX, X, X
LD a5, 0 * SIZE(X)
SXADDQ INCX, X, X
/* One group is consumed by the drain code at $L13; if that was the only
 * group, skip the steady-state loop. */
lda I, -1(I)
ble I, $L13
.align 4
/* Steady-state loop, one iteration per group of eight values.  Each ADD
 * folds the value staged in the previous step, each fmov stages the next
 * one.  The loads fetch a6/a7 of the current group and a0-a5 of the next. */
$L12:
ADD s0, t0, s0
/* Load into $31 whose result is never used; presumably a prefetch hint. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
LD a7, 0 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
SXADDQ INCX, X, X
ADD s0, t0, s0
LD a1, 0 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
LD a3, 0 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
SXADDQ INCX, X, X
LD a5, 0 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4
/* Drain for the last group: only a6 and a7 remain to be loaded. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a7, 0 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3
ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3
/* Fold the values still staged in t1-t3.  t0 (holding a4) is deliberately
 * left pending: the first ADD of $L17 or the ADD at $L999 adds it. */
ADD s1, t1, s1
ADD s2, t2, s2
ADD s3, t3, s3
/* Reduce four partial sums to two; $L15 completes the reduction. */
ADD s0, s1, s0
ADD s2, s3, s2
.align 4
/* Remainder: I = N & 7 leftover values.  s2 is folded into s0 here (it is
 * zero when this label was reached directly from the entry code). */
$L15:
and N, 7, I
ADD s0, s2, s0
unop
ble I, $L999
.align 4
/* One value per iteration: add the previously staged t0, then load and
 * stage the next value. */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
SXADDQ INCX, X, X
fmov a0, t0
lda I, -1(I)
bne I, $L17
.align 4
/* Exit: add the last staged value (zero if nothing was staged) and return
 * with the total in s0. */
$L999:
ADD s0, t0, s0
ret
EPILOGUE

208
kernel/alpha/zsum.S Normal file
View File

@ -0,0 +1,208 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "version.h"
/*
 * Two-component sum kernel: every element consists of two consecutive
 * values, read at offsets 0 * SIZE and 1 * SIZE from X (presumably the real
 * and imaginary parts of a complex number).  The routine returns, in s0, the
 * sum of all offset-0 values plus the sum of all offset-1 values.  Loaded
 * values are only copied (fmov) before being added; no absolute value is
 * taken.
 *
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE are macros from the
 * project headers and are not visible here.  SXADDQ INCX, X, X steps X to
 * the next element (presumably X += INCX * SIZE, with INCX doubled below).
 *
 * The code is software pipelined: a value loaded into aK is first copied to
 * a staging register tJ and only added to the partial sum sJ by a later ADD.
 * s0/s2 and t0/t2 handle offset-0 values, s1/s3 and t1/t3 offset-1 values.
 */
/* Offset used by the discarded load in the main loop. */
#define PREFETCHSIZE 88
/* Integer registers.  N, X and INCX are read without being set here, so they
 * hold the incoming arguments; I is the loop counter. */
#define N $16
#define X $17
#define INCX $18
#define I $19
/* s0-s3: four independent partial sums; s0 also carries the final total. */
#define s0 $f0
#define s1 $f1
#define s2 $f10
#define s3 $f11
/* a0-a7: destinations of the vector loads. */
#define a0 $f12
#define a1 $f13
#define a2 $f14
#define a3 $f15
#define a4 $f16
#define a5 $f17
#define a6 $f18
#define a7 $f19
/* t0-t3: staged copies of loaded values awaiting their ADD. */
#define t0 $f20
#define t1 $f21
#define t2 $f22
#define t3 $f23
PROLOGUE
PROFCODE
fclr s0
unop
fclr t0
/* Double INCX: each element occupies two values. */
addq INCX, INCX, INCX
/* s0, s1, t0 and t1 are all cleared before this branch, so the exit path
 * at $L999 yields 0 when N <= 0. */
fclr s1
unop
fclr t1
ble N, $L999
/* I = N >> 2: number of 4-element groups (eight loaded values each).  With
 * no full group, go straight to the remainder handling at $L15. */
fclr s2
sra N, 2, I
fclr s3
ble I, $L15
/* Preload the first three elements of the first group, clearing t2 and t3
 * in between. */
LD a0, 0 * SIZE(X)
fclr t2
LD a1, 1 * SIZE(X)
SXADDQ INCX, X, X
LD a2, 0 * SIZE(X)
fclr t3
LD a3, 1 * SIZE(X)
SXADDQ INCX, X, X
LD a4, 0 * SIZE(X)
LD a5, 1 * SIZE(X)
SXADDQ INCX, X, X
/* One group is consumed by the drain code at $L13; if that was the only
 * group, skip the steady-state loop. */
lda I, -1(I)
ble I, $L13
.align 4
/* Steady-state loop, one iteration per group of four elements.  Each ADD
 * folds the value staged in the previous step, each fmov stages the next
 * one.  The loads fetch the fourth element of the current group (a6/a7) and
 * the first three elements of the next (a0-a5). */
$L12:
ADD s0, t0, s0
/* Load into $31 whose result is never used; presumably a prefetch hint. */
ldl $31, PREFETCHSIZE * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a6, 0 * SIZE(X)
fmov a1, t1
unop
ADD s2, t2, s2
LD a7, 1 * SIZE(X)
fmov a2, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a0, 0 * SIZE(X)
fmov a3, t3
unop
ADD s0, t0, s0
LD a1, 1 * SIZE(X)
fmov a4, t0
SXADDQ INCX, X, X
ADD s1, t1, s1
LD a2, 0 * SIZE(X)
fmov a5, t1
unop
ADD s2, t2, s2
LD a3, 1 * SIZE(X)
fmov a6, t2
SXADDQ INCX, X, X
ADD s3, t3, s3
LD a4, 0 * SIZE(X)
fmov a7, t3
unop
LD a5, 1 * SIZE(X)
unop
SXADDQ INCX, X, X
bne I, $L12
.align 4
/* Drain for the last group: only its fourth element (a6/a7) remains to be
 * loaded. */
$L13:
ADD s0, t0, s0
LD a6, 0 * SIZE(X)
fmov a0, t0
ADD s1, t1, s1
LD a7, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
ADD s2, t2, s2
fmov a2, t2
ADD s3, t3, s3
fmov a3, t3
ADD s0, t0, s0
fmov a4, t0
ADD s1, t1, s1
fmov a5, t1
ADD s2, t2, s2
fmov a6, t2
ADD s3, t3, s3
fmov a7, t3
/* Fold the values still staged in t2/t3.  t0 and t1 (holding a4 and a5) are
 * deliberately left pending: $L17 or $L999 adds them. */
ADD s2, t2, s2
ADD s3, t3, s3
.align 4
/* Remainder: I = N & 3 leftover elements.  Reduce four partial sums to two
 * (s2 and s3 are zero when this label was reached directly from the entry
 * code). */
$L15:
ADD s0, s2, s0
and N, 3, I
ADD s1, s3, s1
ble I, $L999
.align 4
/* One element per iteration: add the previously staged pair, then load and
 * stage the next pair. */
$L17:
ADD s0, t0, s0
LD a0, 0 * SIZE(X)
fmov a0, t0
lda I, -1(I)
ADD s1, t1, s1
LD a1, 1 * SIZE(X)
fmov a1, t1
SXADDQ INCX, X, X
bne I, $L17
.align 4
/* Exit: add the last staged pair (zero if nothing was staged), then combine
 * the offset-0 and offset-1 totals into s0 and return. */
$L999:
ADD s0, t0, s0
ADD s1, t1, s1
ADD s0, s1, s0
ret
EPILOGUE

View File

@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c

View File

@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SSUMKERNEL = sum_vfp.S
DSUMKERNEL = sum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S

51
kernel/arm/sum.c Normal file
View File

@ -0,0 +1,51 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of asum.c with the ABS() removed *
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Plain (signed) sum of n entries of x, visiting every inc_x-th entry
 * starting at x[0].  Returns 0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG end;

	if (n <= 0 || inc_x <= 0) {
		return total;
	}

	/* First index past the last entry that has to be visited. */
	end = n * inc_x;
	for (pos = 0; pos < end; pos += inc_x) {
		total += x[pos];
	}

	return total;
}

425
kernel/arm/sum_vfp.S Normal file
View File

@ -0,0 +1,425 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* STACKSIZE is defined here but not referenced by the code in this file. */
#define STACKSIZE 256
/* Argument registers (count, data pointer, stride) and the loop counter. */
#define N r0
#define X r1
#define INC_X r2
#define I r12
/* Byte offset used by the pld prefetch instructions. */
#define X_PRE 512
/**************************************************************************************
* Macro definitions
*
* Four variants are selected by COMPLEX and DOUBLE.  In every variant:
*   KERNEL_F4  contiguous data, four elements per expansion, X post-incremented,
*              additions alternate between accumulators 0 and 1
*   KERNEL_F1  contiguous data, one element per expansion
*   KERNEL_S4  strided data, four elements per expansion, accumulator 0 only,
*              X advanced by INC_X after each element
*   KERNEL_S1  strided data, one element per expansion
* For COMPLEX builds an element is two consecutive values, both of which are added.
**************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
/* Real double precision: accumulators d0/d1. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
.endm
.macro KERNEL_S4
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 }
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
#else
/* Real single precision: accumulators s0/s1.  KERNEL_F4 has no pld of its
   own in this variant; the routine body issues it before the macro. */
.macro KERNEL_F4
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
.endm
.macro KERNEL_S4
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 }
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
/* Complex double precision: four elements are eight doubles, loaded as two
   groups of four with a prefetch before each group. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
pld [ X, #X_PRE ]
vldmia.f64 X!, { d4 - d5 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d6 - d7 }
vadd.f64 d1 , d1, d5
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
/* One complex element: both components are loaded and added in turn. */
.macro KERNEL_F1
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
vldmia.f64 X!, { d4 }
vadd.f64 d0 , d0, d4
.endm
.macro KERNEL_S4
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f64 X, { d4 -d5 }
vadd.f64 d0 , d0, d4
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
#else
/* Complex single precision: four elements are eight floats; a single
   prefetch covers both groups of four. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
vldmia.f32 X!, { s4 - s5 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s6 - s7 }
vadd.f32 s1 , s1, s5
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
.macro KERNEL_F1
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
vldmia.f32 X!, { s4 }
vadd.f32 s0 , s0, s4
.endm
.macro KERNEL_S4
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
.macro KERNEL_S1
vldmia.f32 X, { s4 -s5 }
vadd.f32 s0 , s0, s4
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Routine body: returns the plain sum of N elements of X (both components
 * of each element for COMPLEX builds).  The labels keep the asum_kernel_
 * prefix although no absolute value is taken anywhere in this file.
 */
PROLOGUE
.align 5
/* Zero both accumulators by moving integer 0 into s0/s1; for DOUBLE the
   zero is then converted into d0/d1. */
movs r12, #0 // clear floating point register
vmov s0, r12
vmov s1, r12
#if defined(DOUBLE)
vcvt.f64.f32 d0, s0
vcvt.f64.f32 d1, s1
#endif
/* N <= 0 or INC_X == 0: return the zero sum.  Only a zero stride is
   rejected here; a negative INC_X falls through to the strided path. */
cmp N, #0
ble asum_kernel_L999
cmp INC_X, #0
beq asum_kernel_L999
/* Unit stride uses the post-incrementing F kernels, anything else the S kernels. */
cmp INC_X, #1
bne asum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble asum_kernel_F1
.align 5
/* Contiguous main loop, unrolled twice: each KERNEL_F4 consumes one unit of I. */
asum_kernel_F4:
#if !defined(DOUBLE) && !defined(COMPLEX)
/* The real single-precision KERNEL_F4 has no prefetch of its own. */
pld [ X, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble asum_kernel_F1
KERNEL_F4
subs I, I, #1
bne asum_kernel_F4
/* Contiguous tail: the remaining N & 3 elements, one at a time. */
asum_kernel_F1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
b asum_kernel_L999
/* Strided path: first turn INC_X from an element count into a byte offset. */
asum_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble asum_kernel_S1
.align 5
/* Strided main loop: four elements per pass. */
asum_kernel_S4:
KERNEL_S4
subs I, I, #1
bne asum_kernel_S4
/* Strided tail: the remaining N & 3 elements. */
asum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
/* Combine the two accumulators into accumulator 0. */
asum_kernel_L999:
#if defined(DOUBLE)
vadd.f64 d0 , d0, d1 // set return value
#else
vadd.f32 s0 , s0, s1 // set return value
#endif
/* Without __ARM_PCS_VFP the result is also copied into r0 (single)
   or r0:r1 (double). */
#if !defined(__ARM_PCS_VFP)
#if !defined(DOUBLE)
vmov r0, s0
#else
vmov r0, r1, d0
#endif
#endif
bx lr
EPILOGUE

57
kernel/arm/zsum.c Normal file
View File

@ -0,0 +1,57 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* trivial copy of zasum.c with the ABS() removed *
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Sum of the two consecutive values that make up the complex element
 * starting at x[i].  The arguments and the whole expansion are
 * parenthesized so the macro behaves as a single operand in any
 * expression (the previous form expanded to a bare `a+b`).
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i)+1])
/*
 * Plain (signed) sum of the real and imaginary parts of n complex
 * elements of x, visiting every inc_x-th element starting at x[0].
 * x holds interleaved (real, imaginary) pairs, so the scalar stride is
 * 2 * inc_x.  Returns 0 when n or inc_x is not positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i = 0;
	FLOAT sumf = 0.0;
	BLASLONG inc_x2;

	if (n <= 0 || inc_x <= 0) {
		return sumf;
	}

	/* Stride and end index expressed in scalars rather than complex elements. */
	inc_x2 = 2 * inc_x;
	n *= inc_x2;

	while (i < n) {
		sumf += CSUM1(x, i);
		i += inc_x2;
	}
	return sumf;
}

164
kernel/arm64/csum.S Normal file
View File

@ -0,0 +1,164 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/*
 * Single-precision complex sum: adds the real and imaginary parts of N
 * elements of X, unchanged, and leaves the total in SUMF (s0).
 */
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
/* TMPVF and SZ are defined but not used by the macros in this file. */
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
/******************************************************************************/
/* One contiguous element: load (re, im) into v1, copy im into lane 0 of v2
   with ext, form re + im in TMPF and add it to SUMF. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
/* Eight contiguous elements (16 floats, 64 bytes): reduce the four loaded
   vectors lane-wise into the four partial sums held in v0. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
PRFM PLDL1KEEP, [X, #1024]
fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm
/* Collapse the four lanes of v0 into the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm
/* Convert the stride from complex elements to bytes (8 bytes per element). */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm
/* One strided element: same as KERNEL_F1 but X advances by INC_X bytes. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
/* Start from a zero sum; return it unchanged when N <= 0 or INC_X <= 0. */
fmov SUMF, REG0
fmov s1, SUMF
cmp N, xzr
ble .Lcsum_kernel_L999
cmp INC_X, xzr
ble .Lcsum_kernel_L999
/* Unit stride takes the vectorised path, any other stride the scalar one. */
cmp INC_X, #1
bne .Lcsum_kernel_S_BEGIN
.Lcsum_kernel_F_BEGIN:
/* I = N / 8 blocks of eight elements. */
asr I, N, #3
cmp I, xzr
beq .Lcsum_kernel_F1
.Lcsum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne .Lcsum_kernel_F8
KERNEL_F8_FINALIZE
/* Remaining N & 7 contiguous elements, one at a time. */
.Lcsum_kernel_F1:
ands I, N, #7
ble .Lcsum_kernel_L999
.Lcsum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Lcsum_kernel_F10
.Lcsum_kernel_L999:
ret
/* Strided path: four elements per pass, then the N & 3 leftovers. */
.Lcsum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble .Lcsum_kernel_S1
.Lcsum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Lcsum_kernel_S4
.Lcsum_kernel_S1:
ands I, N, #3
ble .Lcsum_kernel_L999
.Lcsum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Lcsum_kernel_S10
ret
EPILOGUE

186
kernel/arm64/sum.S Normal file
View File

@ -0,0 +1,186 @@
/*******************************************************************************
Copyright (c) 2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/*
 * Real sum kernel (single precision, or double when DOUBLE is defined):
 * adds N elements of X, unchanged, and leaves the total in SUMF.
 */
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
/* Precision-dependent names: zero register, accumulator, scalar temporary,
   lane 0 of v1 as an ld1 operand, and the element size in bytes. */
#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
/* One contiguous element: load it, post-increment X by SZ, add to SUMF. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fadd SUMF, SUMF, TMPF
.endm
/* Eight contiguous elements, reduced lane-wise into the partial sums in v0. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // Load X0..X3 into v1 and X4..X7 into v2
fadd v1.4s, v1.4s, v2.4s // v1 = [X0+X4, X1+X5, X2+X6, X3+X7]
fadd v0.4s, v0.4s, v1.4s // add to the four lane-wise partial sums in v0
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
PRFM PLDL1KEEP, [X, #1024]
fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm
/* Collapse the lanes of v0 (four floats or two doubles) into SUMF. */
.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm
/* Convert the stride from elements to bytes. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm
/* One strided element: load into lane 0 of v1, advance X by INC_X bytes. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
/* Start from a zero sum; return it unchanged when N <= 0 or INC_X <= 0. */
fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif
cmp N, xzr
ble .Lsum_kernel_L999
cmp INC_X, xzr
ble .Lsum_kernel_L999
/* Unit stride takes the vectorised path, any other stride the scalar one. */
cmp INC_X, #1
bne .Lsum_kernel_S_BEGIN
.Lsum_kernel_F_BEGIN:
/* I = N / 8 blocks of eight elements. */
asr I, N, #3
cmp I, xzr
beq .Lsum_kernel_F1
.Lsum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne .Lsum_kernel_F8
KERNEL_F8_FINALIZE
/* Remaining N & 7 contiguous elements, one at a time. */
.Lsum_kernel_F1:
ands I, N, #7
ble .Lsum_kernel_L999
.Lsum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Lsum_kernel_F10
.Lsum_kernel_L999:
ret
/* Strided path: four elements per pass, then the N & 3 leftovers. */
.Lsum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble .Lsum_kernel_S1
.Lsum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Lsum_kernel_S4
.Lsum_kernel_S1:
ands I, N, #3
ble .Lsum_kernel_L999
.Lsum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Lsum_kernel_S10
ret
EPILOGUE

158
kernel/arm64/zsum.S Normal file
View File

@ -0,0 +1,158 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/*
 * Double-precision complex sum: adds the real and imaginary parts of N
 * elements of X, unchanged, and leaves the total in SUMF (d0).
 */
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
/* TMPVF and SZ are defined but not used by the macros in this file. */
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
/******************************************************************************/
/* One contiguous element: load (re, im), add the pair with faddp, accumulate. */
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
/* Four contiguous elements (8 doubles, 64 bytes) reduced lane-wise into v0. */
.macro KERNEL_F4
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d
fadd v0.2d, v0.2d, v1.2d
fadd v0.2d, v0.2d, v3.2d
PRFM PLDL1KEEP, [X, #1024]
.endm
/* Collapse the two lanes of v0 into the scalar SUMF. */
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm
/* Convert the stride from complex elements to bytes (16 bytes per element). */
.macro INIT_S
lsl INC_X, INC_X, #4
.endm
/* One strided element: same as KERNEL_F1 but X advances by INC_X bytes. */
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
/* Start from a zero sum; return it unchanged when N <= 0 or INC_X <= 0. */
fmov SUMF, REG0
cmp N, xzr
ble .Lzsum_kernel_L999
cmp INC_X, xzr
ble .Lzsum_kernel_L999
/* Unit stride takes the vectorised path, any other stride the scalar one. */
cmp INC_X, #1
bne .Lzsum_kernel_S_BEGIN
.Lzsum_kernel_F_BEGIN:
/* I = N / 4 blocks of four elements. */
asr I, N, #2
cmp I, xzr
beq .Lzsum_kernel_F1
.Lzsum_kernel_F4:
KERNEL_F4
subs I, I, #1
bne .Lzsum_kernel_F4
KERNEL_F4_FINALIZE
/* Remaining N & 3 contiguous elements, one at a time. */
.Lzsum_kernel_F1:
ands I, N, #3
ble .Lzsum_kernel_L999
.Lzsum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne .Lzsum_kernel_F10
.Lzsum_kernel_L999:
ret
/* Strided path: four elements per pass, then the N & 3 leftovers. */
.Lzsum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble .Lzsum_kernel_S1
.Lzsum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne .Lzsum_kernel_S4
.Lzsum_kernel_S1:
ands I, N, #3
ble .Lzsum_kernel_L999
.Lzsum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne .Lzsum_kernel_S10
ret
EPILOGUE

View File

@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
ZASUMKERNEL = asum.S
XASUMKERNEL = asum.S
CSUMKERNEL = sum.S
ZSUMKERNEL = sum.S
XSUMKERNEL = sum.S
CNRM2KERNEL = nrm2.S
ZNRM2KERNEL = nrm2.S
XNRM2KERNEL = nrm2.S

358
kernel/ia64/sum.S Normal file
View File

@ -0,0 +1,358 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2019, The OpenBLAS project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*
 * Plain (signed) sum of N elements of X with stride INCX.  With COMPLEX
 * defined every element is two consecutive values and both are added.
 * Four partial sums are kept in f8-f11 and combined into f8 at the end.
 */
#define ASSEMBLER
#include "common.h"
/* Prefetch distance, in elements, chosen per precision. */
#ifdef XDOUBLE
#define PREFETCH_SIZE ( 8 * 16 + 4)
#elif defined(DOUBLE)
#define PREFETCH_SIZE (16 * 16 + 8)
#else
#define PREFETCH_SIZE (32 * 16 + 16)
#endif
/*
 * Loads alternate between post-increments of STRIDE and INCX.
 * Real:    STRIDE is INCX, so every load steps one element.
 * Complex: STRIDE is SIZE (step to the second component); INCX is reduced
 *          by SIZE below so the following load lands on the next element.
 */
#ifndef COMPLEX
#define COMPADD 0
#define STRIDE INCX
#else
#define COMPADD 1
#define STRIDE SIZE
#endif
#define PRE1 r2
#define I r17
#define J r18
#define INCX16 r21
#define PR r30
#define ARLC r31
#define N r32
#define X r33
#define INCX r34
PROLOGUE
.prologue
PROFCODE
/* Set up the prefetch pointer, clear the result (f8 = f0) and save ar.lc. */
{ .mfi
adds PRE1 = PREFETCH_SIZE * SIZE, X
mov f8 = f0
.save ar.lc, ARLC
mov ARLC = ar.lc
}
;;
.body
/* With F_INTERFACE, N and INCX arrive as pointers and are loaded here;
   they are sign-extended from 32 bits unless USE64BITINT is defined. */
#ifdef F_INTERFACE
{ .mmi
LDINT N = [N]
LDINT INCX = [INCX]
nop.i 0
}
;;
#ifndef USE64BITINT
{ .mii
nop.m 0
sxt4 N = N
sxt4 INCX = INCX
}
;;
#endif
#endif
/* p6 = !(0 < INCX), p7 = !(0 < N): either one returns the cleared f8.
   I = number of full 16-value groups, J = leftover element count. */
{ .mmi
cmp.lt p0, p6 = r0, INCX
cmp.lt p0, p7 = r0, N
shr I = N, (4 - COMPADD)
}
{ .mbb
and J = ((1 << (4 - COMPADD)) - 1), N
(p6) br.ret.sptk.many b0
(p7) br.ret.sptk.many b0
}
;;
/* I - 1 becomes the loop count; clear the other partial sums; save the
   predicates.  p9 = (J == 0); p12 = bit (3 - COMPADD) of N is set. */
{ .mfi
adds I = -1, I
mov f10 = f0
mov PR = pr
}
{ .mfi
cmp.eq p9, p0 = r0, J
mov f9 = f0
tbit.z p0, p12 = N, 3 - COMPADD
}
;;
/* Initial stage predicates for the pipelined loop: p16 true, p17-p19 false. */
{ .mmi
cmp.eq p16, p0 = r0, r0
cmp.ne p17, p0 = r0, r0
mov ar.ec= 3
}
/* Scale INCX to bytes (BASE_SHIFT comes from outside this file;
   presumably log2 of SIZE - confirm). */
{ .mfi
cmp.ne p18, p0 = r0, r0
mov f11 = f0
shl INCX = INCX, BASE_SHIFT + COMPADD
}
;;
/* INCX16 = step applied to the prefetch pointer by each lfetch. */
{ .mmi
#ifdef XDOUBLE
shladd INCX16 = INCX, (3 - COMPADD), r0
#else
shladd INCX16 = INCX, (4 - COMPADD), r0
#endif
cmp.ne p19, p0 = r0, r0
mov ar.lc = I
}
/* p8 = (I < 0): no full group, go straight to the remainder code. */
{ .mmb
cmp.gt p8 ,p0 = r0, I
#ifdef COMPLEX
adds INCX = - SIZE, INCX
#else
nop.m 0
#endif
(p8) br.cond.dpnt .L55
}
;;
.align 32
/*
 * Software-pipelined main loop driven by br.ctop.  Stage p16 issues the
 * sixteen loads of one group into f32, f35, ... f77; the adds run under
 * p18 and p19 on the same values after the registers have rotated
 * (f34 ... f67 under p18, f71 ... f80 under p19).
 */
.L52:
{ .mmf
(p16) lfetch.nt1 [PRE1], INCX16
(p16) LDFD f32 = [X], STRIDE
}
{ .mfb
(p19) FADD f8 = f8, f71
}
;;
{ .mmf
(p16) LDFD f35 = [X], INCX
}
{ .mfb
(p19) FADD f9 = f9, f74
}
;;
{ .mmf
(p16) LDFD f38 = [X], STRIDE
}
{ .mfb
(p19) FADD f10 = f10, f77
}
;;
{ .mmf
(p16) LDFD f41 = [X], INCX
}
{ .mfb
(p19) FADD f11 = f11, f80
}
;;
{ .mmf
(p16) LDFD f44 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f34
}
;;
{ .mmf
(p16) LDFD f47 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f37
}
;;
{ .mmf
(p16) LDFD f50 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f40
}
;;
{ .mmf
(p16) LDFD f53 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f43
}
;;
{ .mmf
#ifdef XDOUBLE
(p16) lfetch.nt1 [PRE1], INCX16
#endif
(p16) LDFD f56 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f46
}
;;
{ .mmf
(p16) LDFD f59 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f49
}
;;
{ .mmf
(p16) LDFD f62 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f52
}
;;
{ .mmf
(p16) LDFD f65 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f55
}
;;
{ .mmf
(p16) LDFD f68 = [X], STRIDE
}
{ .mfb
(p18) FADD f8 = f8, f58
}
;;
{ .mmf
(p16) LDFD f71 = [X], INCX
}
{ .mfb
(p18) FADD f9 = f9, f61
}
;;
{ .mmf
(p16) LDFD f74 = [X], STRIDE
}
{ .mfb
(p18) FADD f10 = f10, f64
}
;;
{ .mmf
(p16) LDFD f77 = [X], INCX
}
{ .mfb
(p18) FADD f11 = f11, f67
br.ctop.sptk.few .L52
}
;;
/* After the loop: add the four values of the last group that the
   p19 stage has not consumed yet. */
FADD f8 = f8, f71
FADD f9 = f9, f74
FADD f10 = f10, f77
FADD f11 = f11, f80
.align 32
;;
/*
 * Remainder.  Skipped entirely when J == 0 (p9).  Otherwise the set bits of
 * N select how many further values are loaded and added:
 *   p12 -> eight values (f32-f39), p13 -> four (f40-f43),
 *   p14 -> two (f44-f45), p15 -> one (f46, real builds only).
 */
.L55:
(p12) LDFD f32 = [X], STRIDE
(p9) br.cond.dptk .L998
;;
(p12) LDFD f33 = [X], INCX
;;
(p12) LDFD f34 = [X], STRIDE
;;
(p12) LDFD f35 = [X], INCX
tbit.z p0, p13 = N, (2 - COMPADD)
;;
(p12) LDFD f36 = [X], STRIDE
tbit.z p0, p14 = N, (1 - COMPADD)
;;
(p12) LDFD f37 = [X], INCX
#ifndef COMPLEX
tbit.z p0, p15 = N, 0
#endif
;;
(p12) LDFD f38 = [X], STRIDE
;;
(p12) LDFD f39 = [X], INCX
;;
(p13) LDFD f40 = [X], STRIDE
;;
(p13) LDFD f41 = [X], INCX
;;
(p13) LDFD f42 = [X], STRIDE
(p12) FADD f8 = f8, f32
;;
(p13) LDFD f43 = [X], INCX
(p12) FADD f9 = f9, f33
;;
(p14) LDFD f44 = [X], STRIDE
(p12) FADD f10 = f10, f34
;;
(p14) LDFD f45 = [X], INCX
(p12) FADD f11 = f11, f35
;;
#ifndef COMPLEX
(p15) LDFD f46 = [X]
#endif
(p12) FADD f8 = f8, f36
;;
(p12) FADD f9 = f9, f37
(p12) FADD f10 = f10, f38
(p12) FADD f11 = f11, f39
;;
(p13) FADD f8 = f8, f40
(p13) FADD f9 = f9, f41
/* The conditional block below is empty. */
#ifndef COMPLEX
#endif
(p13) FADD f10 = f10, f42
;;
(p13) FADD f11 = f11, f43
(p14) FADD f8 = f8, f44
(p14) FADD f9 = f9, f45
#ifndef COMPLEX
(p15) FADD f10 = f10, f46
#endif
;;
.align 32
/* Restore ar.lc and the saved predicates, fold f9-f11 into f8 and return. */
.L998:
{ .mfi
FADD f8 = f8, f9
mov ar.lc = ARLC
}
{ .mmf
FADD f10 = f10, f11
}
;;
{ .mii
mov pr = PR, -65474
}
;;
{ .mfb
FADD f8 = f8, f10
br.ret.sptk.many b0
}
EPILOGUE

View File

@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c
ifdef HAVE_MSA
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c

47
kernel/mips/sum.c Normal file
View File

@ -0,0 +1,47 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Plain (signed) sum of a strided vector.
 *
 * n      number of elements to add
 * x      base of the vector
 * inc_x  distance, in elements, between consecutive entries
 *
 * Returns x[0] + x[inc_x] + ... + x[(n-1)*inc_x], accumulated in that order.
 * A non-positive n or inc_x yields 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n <= 0 || inc_x <= 0) {
		return total;
	}

	/* One past the last index that will be visited. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		total += x[pos];
	}

	return total;
}

52
kernel/mips/zsum.c Normal file
View File

@ -0,0 +1,52 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Sum of the two adjacent values x[i] and x[i+1].
 * Arguments and the whole expansion are parenthesized so the macro keeps its
 * meaning when used inside a larger expression or with non-trivial arguments.
 */
#define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1])

/*
 * Sum over n two-value elements of x (presumably interleaved real/imaginary
 * parts of complex numbers -- confirm against the caller).
 *
 * n      number of two-value elements to add
 * x      base of the vector
 * inc_x  distance between consecutive elements, counted in two-value units
 *
 * For i = 0, 2*inc_x, 4*inc_x, ... the pair x[i] + x[i+1] is formed first and
 * then added to the running total.  A non-positive n or inc_x yields 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG inc_x2;
	FLOAT sumf = 0.0;

	if (n <= 0 || inc_x <= 0) {
		return sumf;
	}

	/* Each element occupies two FLOATs, so the stride in FLOATs is doubled. */
	inc_x2 = 2 * inc_x;
	/* n becomes the index bound (one past the last element start). */
	n *= inc_x2;

	for (i = 0; i < n; i += inc_x2) {
		sumf += CSUM1(x, i);
	}

	return sumf;
}

332
kernel/mips64/sum.S Normal file
View File

@ -0,0 +1,332 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $4
#define X $5
#define INCX $6
#define I $2
#define TEMP $3
#define a1 $f2
#define a2 $f3
#define a3 $f4
#define a4 $f5
#define a5 $f6
#define a6 $f7
#define a7 $f8
#define a8 $f9
#define t1 $f10
#define t2 $f11
#define t3 $f12
#define t4 $f13
#define s1 $f0
#define s2 $f1
/*
 * Sum kernel: adds N elements of X, taken INCX elements apart, into the two
 * partial sums s1 and s2, and leaves s1 + s2 in s1.
 * The code is written for branch delay slots: the instruction placed after
 * each branch or jump is executed before control transfers (presumably
 * PROLOGUE selects the assembler's noreorder mode -- confirm in common.h).
 */
PROLOGUE
#ifdef F_INTERFACE
/* N and INCX hold addresses in this build; replace each by the value it points to. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Clear both accumulators. */
MTC $0, s1
MTC $0, s2
/* Scale INCX by 2^BASE_SHIFT so it is in the same units as SIZE. */
dsll INCX, INCX, BASE_SHIFT
/* Nothing to add when N <= 0; the delay slot loads SIZE for the stride test. */
blez N, .L999
li TEMP, SIZE
/* Non-unit stride is handled at .L20; the delay slot sets I = N >> 3 for both paths. */
bne INCX, TEMP, .L20
dsra I, N, 3
/* Unit stride with fewer than 8 elements: only the scalar tail at .L15 runs. */
blez I, .L15
NOP
/* Preload the first 8 elements into a1..a8; t1..t4 stage a1..a4 for the adds. */
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)
LD a3, 2 * SIZE(X)
LD a4, 3 * SIZE(X)
LD a5, 4 * SIZE(X)
MOV t1, a1
LD a6, 5 * SIZE(X)
MOV t2, a2
LD a7, 6 * SIZE(X)
MOV t3, a3
MOV t4, a4
daddiu I, I, -1
/* Exactly one block of 8: skip the loop. a8 is still loaded in the delay slot. */
blez I, .L13
LD a8, 7 * SIZE(X)
.align 3
/*
 * Main unit-stride loop: adds the 8 values loaded earlier (alternating between
 * s1 and s2) while loading the next 8.  X advances by 8 elements in the middle
 * of the body, which is why the load offsets drop from 8..11 back to 4..7.
 */
.L12:
ADD s1, s1, t1
LD a1, 8 * SIZE(X)
MOV t1, a5
daddiu I, I, -1
ADD s2, s2, t2
LD a2, 9 * SIZE(X)
MOV t2, a6
NOP
ADD s1, s1, t3
LD a3, 10 * SIZE(X)
MOV t3, a7
NOP
ADD s2, s2, t4
LD a4, 11 * SIZE(X)
MOV t4, a8
daddiu X, X, 8 * SIZE
ADD s1, s1, t1
LD a5, 4 * SIZE(X)
MOV t1, a1
NOP
ADD s2, s2, t2
LD a6, 5 * SIZE(X)
MOV t2, a2
NOP
ADD s1, s1, t3
LD a7, 6 * SIZE(X)
MOV t3, a3
NOP
ADD s2, s2, t4
LD a8, 7 * SIZE(X)
/* Loop while blocks remain; the delay slot stages a4 for the next pass or the drain. */
bgtz I, .L12
MOV t4, a4
.align 3
/* Drain: add the last 8 preloaded values and step X past them. */
.L13:
ADD s1, s1, t1
daddiu X, X, 8 * SIZE
MOV t1, a5
NOP
ADD s2, s2, t2
MOV t2, a6
ADD s1, s1, t3
MOV t3, a7
ADD s2, s2, t4
MOV t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Unit-stride tail: the remaining (N & 7) elements are added one at a time into s1. */
.L15:
andi I, N, 7
blez I, .L999
NOP
.align 3
.L16:
LD a1, 0 * SIZE(X)
daddiu I, I, -1
MOV t1, a1
ADD s1, s1, t1
/* X is advanced by one element in the delay slot. */
bgtz I, .L16
daddiu X, X, SIZE
j .L999
NOP
.align 3
/*
 * General-stride path.  I still holds N >> 3 from the delay slot of the stride
 * test.  Same pipeline as above, except every load reads 0(X) and X is then
 * advanced by INCX.
 */
.L20:
blez I, .L25
NOP
LD a1, 0 * SIZE(X)
daddu X, X, INCX
LD a2, 0 * SIZE(X)
daddu X, X, INCX
LD a3, 0 * SIZE(X)
daddu X, X, INCX
LD a4, 0 * SIZE(X)
daddu X, X, INCX
LD a5, 0 * SIZE(X)
daddu X, X, INCX
LD a6, 0 * SIZE(X)
daddu X, X, INCX
MOV t1, a1
LD a7, 0 * SIZE(X)
MOV t2, a2
daddu X, X, INCX
MOV t3, a3
LD a8, 0 * SIZE(X)
MOV t4, a4
daddiu I, I, -1
/* Exactly one block of 8: skip the loop; the delay slot steps X past a8. */
blez I, .L24
daddu X, X, INCX
.align 3
/* Main strided loop: add the previous 8 values while loading the next 8. */
.L23:
ADD s1, s1, t1
LD a1, 0 * SIZE(X)
MOV t1, a5
daddu X, X, INCX
ADD s2, s2, t2
LD a2, 0 * SIZE(X)
MOV t2, a6
daddu X, X, INCX
ADD s1, s1, t3
LD a3, 0 * SIZE(X)
MOV t3, a7
daddu X, X, INCX
ADD s2, s2, t4
LD a4, 0 * SIZE(X)
MOV t4, a8
daddu X, X, INCX
ADD s1, s1, t1
LD a5, 0 * SIZE(X)
MOV t1, a1
daddu X, X, INCX
ADD s2, s2, t2
LD a6, 0 * SIZE(X)
MOV t2, a2
daddu X, X, INCX
ADD s1, s1, t3
LD a7, 0 * SIZE(X)
MOV t3, a3
daddu X, X, INCX
ADD s2, s2, t4
LD a8, 0 * SIZE(X)
MOV t4, a4
daddiu I, I, -1
/* The delay slot steps X past a8 on every pass, including the last. */
bgtz I, .L23
daddu X, X, INCX
.align 3
/* Drain: add the last 8 preloaded values (X already points past them). */
.L24:
ADD s1, s1, t1
MOV t1, a5
ADD s2, s2, t2
MOV t2, a6
ADD s1, s1, t3
MOV t3, a7
ADD s2, s2, t4
MOV t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Strided tail: the remaining (N & 7) elements, one at a time into s1. */
.L25:
andi I, N, 7
blez I, .L999
NOP
.align 3
.L26:
LD a1, 0 * SIZE(X)
daddiu I, I, -1
MOV t1, a1
daddu X, X, INCX
/* The add itself sits in the delay slot, so it runs on every pass. */
bgtz I, .L26
ADD s1, s1, t1
.align 3
/* Return; the delay slot folds s2 into s1, which holds the final sum. */
.L999:
j $31
ADD s1, s1, s2
EPILOGUE

204
kernel/mips64/zsum.S Normal file
View File

@ -0,0 +1,204 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define N $4
#define X $5
#define INCX $6
#define I $2
#define TEMP $3
#define a1 $f2
#define a2 $f3
#define a3 $f4
#define a4 $f5
#define a5 $f6
#define a6 $f7
#define a7 $f8
#define a8 $f9
#define t1 $f10
#define t2 $f11
#define t3 $f12
#define t4 $f13
#define s1 $f0
#define s2 $f1
/*
 * Two-value-per-element sum kernel: for each of N elements it loads the values
 * at 0*SIZE(X) and 1*SIZE(X), adds the first into s1 and the second into s2,
 * and advances X by INCX.  s1 + s2 is left in s1 on return.
 * The code is written for branch delay slots: the instruction placed after
 * each branch or jump is executed before control transfers (presumably
 * PROLOGUE selects the assembler's noreorder mode -- confirm in common.h).
 */
PROLOGUE
#ifdef F_INTERFACE
/* N and INCX hold addresses in this build; replace each by the value it points to. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Clear both accumulators. */
MTC $0, s1
MTC $0, s2
/* Scale INCX by 2^ZBASE_SHIFT so it can be added to X directly. */
dsll INCX, INCX, ZBASE_SHIFT
/* Nothing to add when N <= 0; the delay slot sets I = N >> 2 (blocks of 4 elements). */
blez N, .L999
dsra I, N, 2
/* Fewer than 4 elements: only the tail loop at .L25 runs. */
blez I, .L25
NOP
/* Preload 4 elements (8 values) into a1..a8; t1..t4 stage a1..a4 for the adds. */
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)
daddu X, X, INCX
LD a3, 0 * SIZE(X)
LD a4, 1 * SIZE(X)
daddu X, X, INCX
LD a5, 0 * SIZE(X)
LD a6, 1 * SIZE(X)
daddu X, X, INCX
MOV t1, a1
MOV t2, a2
LD a7, 0 * SIZE(X)
LD a8, 1 * SIZE(X)
MOV t3, a3
MOV t4, a4
daddiu I, I, -1
/* Exactly one block: skip the loop; the delay slot steps X past the 4th element. */
blez I, .L24
daddu X, X, INCX
.align 3
/* Main loop: add the 8 values loaded previously while loading the next 8. */
.L23:
ADD s1, s1, t1
LD a1, 0 * SIZE(X)
MOV t1, a5
daddiu I, I, -1
ADD s2, s2, t2
LD a2, 1 * SIZE(X)
MOV t2, a6
daddu X, X, INCX
ADD s1, s1, t3
LD a3, 0 * SIZE(X)
MOV t3, a7
NOP
ADD s2, s2, t4
LD a4, 1 * SIZE(X)
MOV t4, a8
daddu X, X, INCX
ADD s1, s1, t1
LD a5, 0 * SIZE(X)
MOV t1, a1
NOP
ADD s2, s2, t2
LD a6, 1 * SIZE(X)
MOV t2, a2
daddu X, X, INCX
ADD s1, s1, t3
LD a7, 0 * SIZE(X)
MOV t3, a3
LD a8, 1 * SIZE(X)
ADD s2, s2, t4
daddu X, X, INCX
/* Loop while blocks remain; the delay slot stages a4 for the next pass or the drain. */
bgtz I, .L23
MOV t4, a4
.align 3
/* Drain: add the last 8 preloaded values (X already points past them). */
.L24:
ADD s1, s1, t1
MOV t1, a5
ADD s2, s2, t2
MOV t2, a6
ADD s1, s1, t3
MOV t3, a7
ADD s2, s2, t4
MOV t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Tail: the remaining (N & 3) elements, one element (two values) per pass. */
.L25:
andi I, N, 3
blez I, .L999
NOP
.align 3
.L26:
LD a1, 0 * SIZE(X)
LD a2, 1 * SIZE(X)
MOV t1, a1
daddiu I, I, -1
MOV t2, a2
daddu X, X, INCX
ADD s1, s1, t1
/* The second add sits in the delay slot, so it runs on every pass. */
bgtz I, .L26
ADD s2, s2, t2
.align 3
/* Return; the delay slot folds s2 into s1, which holds the final sum. */
.L999:
j $31
ADD s1, s1, s2
EPILOGUE

446
kernel/power/sum.S Normal file
View File

@ -0,0 +1,446 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define N r3
#define X r4
#define INCX r5
#define PREA r8
#define FZERO f0
#define STACKSIZE 160
/*
 * Sum kernel: adds N elements of X, taken INCX elements apart, using eight
 * partial sums f0..f7 that are combined at LL(999).  The total is written to
 * f1 just before returning.
 * f8..f15 and f24..f31 receive loaded data; f16..f23 stage the values that the
 * next group of adds consumes, so loads run one half-block ahead of the adds.
 */
PROLOGUE
PROFCODE
/* Reserve a stack frame and save f14..f31; they are restored before blr. */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Build floating-point zero: store integer 0 to the frame and reload it into FZERO (f0). */
stw r0, 144(SP)
lfs FZERO,144(SP)
#ifdef F_INTERFACE
/* N and INCX hold addresses in this build; replace each by the value it points to. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Scale INCX by 2^BASE_SHIFT so it is in the same units as SIZE. */
slwi INCX, INCX, BASE_SHIFT
/* Clear the remaining partial sums f1..f7 (f0 is FZERO itself). */
fmr f1, FZERO
fmr f2, FZERO
fmr f3, FZERO
fmr f4, FZERO
fmr f5, FZERO
fmr f6, FZERO
fmr f7, FZERO
/* PREA is the offset handed to L1_PREFETCH in the unit-stride loop. */
li PREA, L1_PREFETCHSIZE
/* N <= 0 or INCX <= 0: go straight to the reduction, which then yields zero. */
cmpwi cr0, N, 0
ble- LL(999)
cmpwi cr0, INCX, 0
ble- LL(999)
/* Anything other than unit stride is handled at LL(100). */
cmpwi cr0, INCX, SIZE
bne- cr0, LL(100)
/* CTR = N >> 4 blocks of 16 elements; with no full block go to the tail at LL(50). */
srawi. r0, N, 4
mtspr CTR, r0
beq- cr0, LL(50)
.align 4
/* Preload the first 16 elements. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
LFD f10, 2 * SIZE(X)
LFD f11, 3 * SIZE(X)
LFD f12, 4 * SIZE(X)
LFD f13, 5 * SIZE(X)
LFD f14, 6 * SIZE(X)
LFD f15, 7 * SIZE(X)
LFD f24, 8 * SIZE(X)
LFD f25, 9 * SIZE(X)
LFD f26, 10 * SIZE(X)
LFD f27, 11 * SIZE(X)
LFD f28, 12 * SIZE(X)
LFD f29, 13 * SIZE(X)
LFD f30, 14 * SIZE(X)
LFD f31, 15 * SIZE(X)
/* Stage the first 8 values for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11
fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
/* Decrement CTR; if that was the only block, go directly to the drain at LL(20). */
bdz LL(20)
.align 4
/*
 * Main unit-stride loop: adds the 16 values loaded on the previous pass while
 * loading elements 16..31 relative to X, then advances X by 16 elements.
 */
LL(10):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
LFD f8, 16 * SIZE(X)
LFD f9, 17 * SIZE(X)
LFD f10, 18 * SIZE(X)
LFD f11, 19 * SIZE(X)
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
LFD f12, 20 * SIZE(X)
LFD f13, 21 * SIZE(X)
LFD f14, 22 * SIZE(X)
LFD f15, 23 * SIZE(X)
FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9
FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11
LFD f24, 24 * SIZE(X)
LFD f25, 25 * SIZE(X)
LFD f26, 26 * SIZE(X)
LFD f27, 27 * SIZE(X)
FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13
FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15
LFD f28, 28 * SIZE(X)
LFD f29, 29 * SIZE(X)
LFD f30, 30 * SIZE(X)
LFD f31, 31 * SIZE(X)
/* The prefetch is issued before the pointer bump, except on POWER6 where it follows it. */
#ifndef POWER6
L1_PREFETCH X, PREA
#endif
addi X, X, 16 * SIZE
#ifdef POWER6
L1_PREFETCH X, PREA
#endif
bdnz LL(10)
.align 4
/* Drain: add the last 16 preloaded values and step X past them. */
LL(20):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19
FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
addi X, X, 16 * SIZE
.align 4
/* Unit-stride tail: the remaining (N & 15) elements are added one at a time into f0. */
LL(50):
andi. r0, N, 15
mtspr CTR, r0
beq LL(999)
.align 4
LL(60):
LFD f8, 0 * SIZE(X)
addi X, X, 1 * SIZE
FADD f0, f0, f8
bdnz LL(60)
b LL(999)
.align 4
/*
 * General-stride path.  X is first moved back by INCX because every LFDUX
 * below uses X, INCX as its operands and X is expected to step forward by INCX
 * on each load (assuming LFDUX is the load-with-update indexed form -- confirm
 * in common.h).
 */
LL(100):
sub X, X, INCX
/* CTR = N >> 4 blocks of 16 elements; with no full block go to the tail at LL(150). */
srawi. r0, N, 4
mtspr CTR, r0
beq- LL(150)
/* Preload the first 16 elements. */
LFDUX f8, X, INCX
LFDUX f9, X, INCX
LFDUX f10, X, INCX
LFDUX f11, X, INCX
LFDUX f12, X, INCX
LFDUX f13, X, INCX
LFDUX f14, X, INCX
LFDUX f15, X, INCX
LFDUX f24, X, INCX
LFDUX f25, X, INCX
LFDUX f26, X, INCX
LFDUX f27, X, INCX
LFDUX f28, X, INCX
LFDUX f29, X, INCX
LFDUX f30, X, INCX
LFDUX f31, X, INCX
/* Stage the first 8 values for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11
fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
/* Decrement CTR; if that was the only block, go directly to the drain at LL(120). */
bdz LL(120)
.align 4
/* Main strided loop: add the previous 16 values while loading the next 16. */
LL(110):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
LFDUX f8, X, INCX
LFDUX f9, X, INCX
LFDUX f10, X, INCX
LFDUX f11, X, INCX
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
LFDUX f12, X, INCX
LFDUX f13, X, INCX
LFDUX f14, X, INCX
LFDUX f15, X, INCX
FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9
FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11
LFDUX f24, X, INCX
LFDUX f25, X, INCX
LFDUX f26, X, INCX
LFDUX f27, X, INCX
FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13
FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15
LFDUX f28, X, INCX
LFDUX f29, X, INCX
LFDUX f30, X, INCX
LFDUX f31, X, INCX
bdnz LL(110)
.align 4
/* Drain: add the last 16 preloaded values. */
LL(120):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19
FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
.align 4
/* Strided tail: the remaining (N & 15) elements are added one at a time into f0. */
LL(150):
andi. r0, N, 15
mtspr CTR, r0
beq LL(999)
.align 4
LL(160):
LFDUX f8, X, INCX
FADD f0, f0, f8
bdnz LL(160)
.align 4
/* Combine the eight partial sums pairwise; the total is written to f1. */
LL(999):
FADD f0, f0, f1
FADD f2, f2, f3
FADD f4, f4, f5
FADD f6, f6, f7
FADD f0, f0, f2
FADD f4, f4, f6
FADD f1, f0, f4
/* Restore the saved floating-point registers and release the frame. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

452
kernel/power/zsum.S Normal file
View File

@ -0,0 +1,452 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define N r3
#define X r4
#define INCX r5
#define INCXM1 r9
#define PREA r8
#define FZERO f0
#define STACKSIZE 160
/*
 * Two-value-per-element sum kernel: adds both values of each of N elements of
 * X, with consecutive elements INCX apart, using eight partial sums f0..f7
 * that are combined at LL(999).  The total is written to f1 just before
 * returning.
 * f8..f15 and f24..f31 receive loaded data; f16..f23 stage the values that the
 * next group of adds consumes, so loads run one half-block ahead of the adds.
 */
PROLOGUE
PROFCODE
/* Reserve a stack frame and save f14..f31; they are restored before blr. */
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Build floating-point zero: store integer 0 to the frame and reload it into FZERO (f0). */
stw r0, 144(SP)
lfs FZERO,144(SP)
#ifdef F_INTERFACE
/* N and INCX hold addresses in this build; replace each by the value it points to. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Scale INCX by 2^ZBASE_SHIFT; INCXM1 is that stride minus one SIZE. */
slwi INCX, INCX, ZBASE_SHIFT
subi INCXM1, INCX, SIZE
/* Clear the remaining partial sums f1..f7 (f0 is FZERO itself). */
fmr f1, FZERO
fmr f2, FZERO
fmr f3, FZERO
fmr f4, FZERO
fmr f5, FZERO
fmr f6, FZERO
fmr f7, FZERO
/* PREA is the offset handed to L1_PREFETCH in the contiguous loop. */
li PREA, L1_PREFETCHSIZE
/* N <= 0 or INCX <= 0: go straight to the reduction, which then yields zero. */
cmpwi cr0, N, 0
ble- LL(999)
cmpwi cr0, INCX, 0
ble- LL(999)
/* Elements that are not exactly 2 * SIZE apart are handled at LL(100). */
cmpwi cr0, INCX, 2 * SIZE
bne- cr0, LL(100)
/* CTR = N >> 3 blocks of 8 elements (16 values); with no full block go to LL(50). */
srawi. r0, N, 3
mtspr CTR, r0
beq- cr0, LL(50)
.align 4
/* Preload the first 16 values. */
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
LFD f10, 2 * SIZE(X)
LFD f11, 3 * SIZE(X)
LFD f12, 4 * SIZE(X)
LFD f13, 5 * SIZE(X)
LFD f14, 6 * SIZE(X)
LFD f15, 7 * SIZE(X)
LFD f24, 8 * SIZE(X)
LFD f25, 9 * SIZE(X)
LFD f26, 10 * SIZE(X)
LFD f27, 11 * SIZE(X)
LFD f28, 12 * SIZE(X)
LFD f29, 13 * SIZE(X)
LFD f30, 14 * SIZE(X)
LFD f31, 15 * SIZE(X)
/* Stage the first 8 values for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11
fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
/* Decrement CTR; if that was the only block, go directly to the drain at LL(20). */
bdz LL(20)
.align 4
/*
 * Main contiguous loop: adds the 16 values loaded on the previous pass while
 * loading values 16..31 relative to X, then advances X by 16 values.
 */
LL(10):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
LFD f8, 16 * SIZE(X)
LFD f9, 17 * SIZE(X)
LFD f10, 18 * SIZE(X)
LFD f11, 19 * SIZE(X)
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
LFD f12, 20 * SIZE(X)
LFD f13, 21 * SIZE(X)
LFD f14, 22 * SIZE(X)
LFD f15, 23 * SIZE(X)
FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9
FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11
LFD f24, 24 * SIZE(X)
LFD f25, 25 * SIZE(X)
LFD f26, 26 * SIZE(X)
LFD f27, 27 * SIZE(X)
FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13
FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15
LFD f28, 28 * SIZE(X)
LFD f29, 29 * SIZE(X)
LFD f30, 30 * SIZE(X)
LFD f31, 31 * SIZE(X)
/* The prefetch is issued before the pointer bump, except on POWER6 where it follows it. */
#ifndef POWER6
L1_PREFETCH X, PREA
#endif
addi X, X, 16 * SIZE
#ifdef POWER6
L1_PREFETCH X, PREA
#endif
bdnz LL(10)
.align 4
/* Drain: add the last 16 preloaded values and step X past them. */
LL(20):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19
FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
addi X, X, 16 * SIZE
.align 4
/* Contiguous tail: the remaining (N & 7) elements, two values per pass into f0 and f1. */
LL(50):
andi. r0, N, 7
mtspr CTR, r0
beq LL(999)
.align 4
LL(60):
LFD f8, 0 * SIZE(X)
LFD f9, 1 * SIZE(X)
addi X, X, 2 * SIZE
FADD f0, f0, f8
FADD f1, f1, f9
bdnz LL(60)
b LL(999)
.align 4
/*
 * General-stride path.  X is first moved back by INCXM1.  Each element is then
 * read with a pair of loads: LFDX at X + INCXM1 for the first value, followed
 * by LFDUX with X, INCX for the second value (assuming LFDX is the plain
 * indexed load and LFDUX the indexed load that also updates X -- confirm in
 * common.h).
 */
LL(100):
sub X, X, INCXM1
/* CTR = N >> 3 blocks of 8 elements; with no full block go to the tail at LL(150). */
srawi. r0, N, 3
mtspr CTR, r0
beq- LL(150)
/* Preload the first 8 elements (16 values). */
LFDX f8, X, INCXM1
LFDUX f9, X, INCX
LFDX f10, X, INCXM1
LFDUX f11, X, INCX
LFDX f12, X, INCXM1
LFDUX f13, X, INCX
LFDX f14, X, INCXM1
LFDUX f15, X, INCX
LFDX f24, X, INCXM1
LFDUX f25, X, INCX
LFDX f26, X, INCXM1
LFDUX f27, X, INCX
LFDX f28, X, INCXM1
LFDUX f29, X, INCX
LFDX f30, X, INCXM1
LFDUX f31, X, INCX
/* Stage the first 8 values for the adds. */
fmr f16, f8
fmr f17, f9
fmr f18, f10
fmr f19, f11
fmr f20, f12
fmr f21, f13
fmr f22, f14
fmr f23, f15
/* Decrement CTR; if that was the only block, go directly to the drain at LL(120). */
bdz LL(120)
.align 4
/* Main strided loop: add the previous 16 values while loading the next 16. */
LL(110):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
LFDX f8, X, INCXM1
LFDUX f9, X, INCX
LFDX f10, X, INCXM1
LFDUX f11, X, INCX
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
LFDX f12, X, INCXM1
LFDUX f13, X, INCX
LFDX f14, X, INCXM1
LFDUX f15, X, INCX
FADD f0, f0, f16
fmr f16, f8
FADD f1, f1, f17
fmr f17, f9
FADD f2, f2, f18
fmr f18, f10
FADD f3, f3, f19
fmr f19, f11
LFDX f24, X, INCXM1
LFDUX f25, X, INCX
LFDX f26, X, INCXM1
LFDUX f27, X, INCX
FADD f4, f4, f20
fmr f20, f12
FADD f5, f5, f21
fmr f21, f13
FADD f6, f6, f22
fmr f22, f14
FADD f7, f7, f23
fmr f23, f15
LFDX f28, X, INCXM1
LFDUX f29, X, INCX
LFDX f30, X, INCXM1
LFDUX f31, X, INCX
bdnz LL(110)
.align 4
/* Drain: add the last 16 preloaded values. */
LL(120):
FADD f0, f0, f16
fmr f16, f24
FADD f1, f1, f17
fmr f17, f25
FADD f2, f2, f18
fmr f18, f26
FADD f3, f3, f19
fmr f19, f27
FADD f4, f4, f20
fmr f20, f28
FADD f5, f5, f21
fmr f21, f29
FADD f6, f6, f22
fmr f22, f30
FADD f7, f7, f23
fmr f23, f31
FADD f0, f0, f16
FADD f1, f1, f17
FADD f2, f2, f18
FADD f3, f3, f19
FADD f4, f4, f20
FADD f5, f5, f21
FADD f6, f6, f22
FADD f7, f7, f23
.align 4
/* Strided tail: the remaining (N & 7) elements, two values per pass into f0 and f1. */
LL(150):
andi. r0, N, 7
mtspr CTR, r0
beq LL(999)
.align 4
LL(160):
LFDX f8, X, INCXM1
LFDUX f9, X, INCX
FADD f0, f0, f8
FADD f1, f1, f9
bdnz LL(160)
.align 4
/* Combine the eight partial sums pairwise; the total is written to f1. */
LL(999):
FADD f0, f0, f1
FADD f2, f2, f3
FADD f4, f4, f5
FADD f6, f6, f7
FADD f0, f0, f2
FADD f4, f4, f6
FADD f1, f0, f4
/* Restore the saved floating-point registers and release the frame. */
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE

View File

@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = {
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
sgemv_nTS, sgemv_tTS, sger_kTS,
@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = {
damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS,
drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
dgemv_nTS, dgemv_tTS, dger_kTS,
dsymv_LTS, dsymv_UTS,
@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = {
qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS,
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
qgemv_nTS, qgemv_tTS, qger_kTS,
qsymv_LTS, qsymv_UTS,
@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = {
#endif
camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
cnrm2_kTS, casum_kTS, ccopy_kTS,
cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
cdotu_kTS, cdotc_kTS, csrot_kTS,
caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = {
#endif
zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
znrm2_kTS, zasum_kTS, zcopy_kTS,
znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
zdotu_kTS, zdotc_kTS, zdrot_kTS,
zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = {
XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
xnrm2_kTS, xasum_kTS, xcopy_kTS,
xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
xdotu_kTS, xdotc_kTS, xqrot_kTS,
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,

325
kernel/sparc/sum.S Normal file
View File

@ -0,0 +1,325 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* SPARC kernel: plain (signed) sum of N elements of X taken with stride INCX. */
/* The result is accumulated in c1 (%f0). */
#define ASSEMBLER
#include "common.h"
/* Integer register aliases: element count, vector pointer, stride, loop counter. */
#define N %i0
#define X %i1
#define INCX %i2
#define I %i3
/* Floating-point register aliases: */
/*   c1, c2 - the two running partial sums */
/*   t1..t4 - values staged for the next FADD (software pipeline) */
/*   a1..a8 - values most recently loaded from X */
#ifdef DOUBLE
#define c1 %f0
#define c2 %f2
#define t1 %f8
#define t2 %f10
#define t3 %f12
#define t4 %f14
#define a1 %f16
#define a2 %f18
#define a3 %f20
#define a4 %f22
#define a5 %f24
#define a6 %f26
#define a7 %f28
#define a8 %f30
#else
#define c1 %f0
#define c2 %f1
#define t1 %f4
#define t2 %f5
#define t3 %f6
#define t4 %f7
#define a1 %f8
#define a2 %f9
#define a3 %f10
#define a4 %f11
#define a5 %f12
#define a6 %f13
#define a7 %f14
#define a8 %f15
#endif
PROLOGUE
SAVESP
/* NOTE(review): FCLR(0) presumably zeroes %f0 (c1); the FMOVs below then */
/* propagate that value to c2 and t1..t4 - confirm against the macro. */
FCLR(0)
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes; confirm). */
sll INCX, BASE_SHIFT, INCX
FMOV c1, c2
FMOV c1, t1
FMOV c1, t2
FMOV c1, t3
FMOV c1, t4
/* Non-positive stride: go straight to the final reduction at .LL19. */
cmp INCX, 0
ble .LL19
/* This cmp follows the branch directly (delay slot); .LL19 does not use the */
/* condition codes, so running it on the taken path is harmless. */
cmp INCX, SIZE
/* Stride is not one element: use the strided path at .LL50. */
bne .LL50
/* I = N / 8 blocks. This also follows the branch directly; .LL50 recomputes */
/* the same value, so executing it on the taken path is harmless. */
sra N, 3, I
cmp I, 0
ble,pn %icc, .LL15
nop
/* Prime the pipeline: load the first block of eight contiguous elements. */
LDF [X + 0 * SIZE], a1
add I, -1, I
LDF [X + 1 * SIZE], a2
cmp I, 0
LDF [X + 2 * SIZE], a3
LDF [X + 3 * SIZE], a4
LDF [X + 4 * SIZE], a5
LDF [X + 5 * SIZE], a6
LDF [X + 6 * SIZE], a7
LDF [X + 7 * SIZE], a8
/* Only one block: skip the steady-state loop and drain at .LL12. */
ble,pt %icc, .LL12
add X, 8 * SIZE, X
#define PREFETCHSIZE 128
/* Steady-state loop, contiguous case: each step adds the value staged in */
/* t1..t4, restages the previously loaded a-register and reloads it from */
/* the next block. */
.LL11:
FADD c1, t1, c1
prefetch [X + PREFETCHSIZE * SIZE], 0
FMOV a1, t1
LDF [X + 0 * SIZE], a1
FADD c2, t2, c2
add I, -1, I
FMOV a2, t2
LDF [X + 1 * SIZE], a2
FADD c1, t3, c1
cmp I, 0
FMOV a3, t3
LDF [X + 2 * SIZE], a3
FADD c2, t4, c2
nop
FMOV a4, t4
LDF [X + 3 * SIZE], a4
FADD c1, t1, c1
nop
FMOV a5, t1
LDF [X + 4 * SIZE], a5
FADD c2, t2, c2
nop
FMOV a6, t2
LDF [X + 5 * SIZE], a6
FADD c1, t3, c1
FMOV a7, t3
LDF [X + 6 * SIZE], a7
add X, 8 * SIZE, X
FADD c2, t4, c2
FMOV a8, t4
bg,pt %icc, .LL11
/* Delay slot: X was already advanced, so offset -1 is the block's 8th element. */
LDF [X - 1 * SIZE], a8
/* Drain: fold in the last loaded block; a5..a8 remain staged in t1..t4. */
.LL12:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2
FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4
FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2
FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4
/* Remainder: the low three bits of N give the leftover element count. */
.LL15:
and N, 7, I
cmp I, 0
ble,a,pn %icc, .LL19
nop
/* One element per iteration, staged through t1. */
.LL16:
LDF [X + 0 * SIZE], a1
add I, -1, I
cmp I, 0
FADD c1, t1, c1
FMOV a1, t1
bg,pt %icc, .LL16
add X, 1 * SIZE, X
/* Final reduction: add the still-staged t1..t4, then merge c2 into c1. */
.LL19:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2
FADD c1, c2, c1
return %i7 + 8
clr %g0
/* Strided path: same pipeline, but X advances by INCX after every load. */
.LL50:
sra N, 3, I
cmp I, 0
ble,pn %icc, .LL55
nop
/* Prime the pipeline with eight strided elements. */
LDF [X + 0 * SIZE], a1
add X, INCX, X
LDF [X + 0 * SIZE], a2
add X, INCX, X
LDF [X + 0 * SIZE], a3
add X, INCX, X
LDF [X + 0 * SIZE], a4
add X, INCX, X
LDF [X + 0 * SIZE], a5
add X, INCX, X
LDF [X + 0 * SIZE], a6
add X, INCX, X
add I, -1, I
LDF [X + 0 * SIZE], a7
cmp I, 0
add X, INCX, X
LDF [X + 0 * SIZE], a8
ble,pt %icc, .LL52
add X, INCX, X
/* Steady-state loop, strided case. */
.LL51:
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
LDF [X + 0 * SIZE], a1
add X, INCX, X
FADD c2, t2, c2
cmp I, 0
FMOV a2, t2
LDF [X + 0 * SIZE], a2
add X, INCX, X
FADD c1, t3, c1
FMOV a3, t3
LDF [X + 0 * SIZE], a3
add X, INCX, X
FADD c2, t4, c2
FMOV a4, t4
LDF [X + 0 * SIZE], a4
add X, INCX, X
FADD c1, t1, c1
FMOV a5, t1
LDF [X + 0 * SIZE], a5
add X, INCX, X
FADD c2, t2, c2
FMOV a6, t2
LDF [X + 0 * SIZE], a6
add X, INCX, X
FADD c1, t3, c1
FMOV a7, t3
LDF [X + 0 * SIZE], a7
add X, INCX, X
FADD c2, t4, c2
FMOV a8, t4
LDF [X + 0 * SIZE], a8
bg,pt %icc, .LL51
add X, INCX, X
/* Drain the last loaded block (strided case). */
.LL52:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2
FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4
FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2
FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4
/* Remainder for the strided case. */
.LL55:
and N, 7, I
cmp I, 0
ble,a,pn %icc, .LL59
nop
.LL56:
LDF [X + 0 * SIZE], a1
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
cmp I, 0
bg,pt %icc, .LL56
add X, INCX, X
/* Final reduction for the strided path; the result is left in c1. */
.LL59:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2
FADD c1, c2, c1
return %i7 + 8
clr %o0
EPILOGUE

327
kernel/sparc/zsum.S Normal file
View File

@ -0,0 +1,327 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* SPARC kernel: sum of the real and imaginary parts of N complex elements */
/* of X taken with stride INCX. The result is accumulated in c1 (%f0). */
#define ASSEMBLER
#include "common.h"
/* Integer register aliases: element count, vector pointer, stride, loop counter. */
#define N %i0
#define X %i1
#define INCX %i2
#define I %i3
/* Floating-point register aliases: */
/*   c1, c2 - the two running partial sums */
/*   t1..t4 - values staged for the next FADD (software pipeline) */
/*   a1..a8 - values most recently loaded from X */
#ifdef DOUBLE
#define c1 %f0
#define c2 %f2
#define t1 %f8
#define t2 %f10
#define t3 %f12
#define t4 %f14
#define a1 %f16
#define a2 %f18
#define a3 %f20
#define a4 %f22
#define a5 %f24
#define a6 %f26
#define a7 %f28
#define a8 %f30
#else
#define c1 %f0
#define c2 %f1
#define t1 %f4
#define t2 %f5
#define t3 %f6
#define t4 %f7
#define a1 %f8
#define a2 %f9
#define a3 %f10
#define a4 %f11
#define a5 %f12
#define a6 %f13
#define a7 %f14
#define a8 %f15
#endif
PROLOGUE
SAVESP
/* NOTE(review): FCLR(0) presumably zeroes %f0 (c1); the FMOVs below then */
/* propagate that value to c2 and t1..t4 - confirm against the macro. */
FCLR(0)
/* Scale the stride by ZBASE_SHIFT (presumably complex elements -> bytes; confirm). */
sll INCX, ZBASE_SHIFT, INCX
FMOV c1, c2
FMOV c1, t1
FMOV c1, t2
FMOV c1, t3
FMOV c1, t4
/* Non-positive stride: go straight to the final reduction at .LL19. */
cmp INCX, 0
ble .LL19
nop
/* Stride is not one complex element (2 * SIZE): use the strided path. */
cmp INCX, 2 * SIZE
bne .LL50
nop
/* I = N / 4: each block holds four complex elements, i.e. eight scalars. */
sra N, 2, I
cmp I, 0
ble,pn %icc, .LL15
nop
/* Prime the pipeline: load the first eight contiguous scalars. */
LDF [X + 0 * SIZE], a1
add I, -1, I
LDF [X + 1 * SIZE], a2
cmp I, 0
LDF [X + 2 * SIZE], a3
LDF [X + 3 * SIZE], a4
LDF [X + 4 * SIZE], a5
LDF [X + 5 * SIZE], a6
LDF [X + 6 * SIZE], a7
LDF [X + 7 * SIZE], a8
/* Only one block: skip the steady-state loop and drain at .LL12. */
ble,pt %icc, .LL12
add X, 8 * SIZE, X
#define PREFETCHSIZE 32
/* Steady-state loop, contiguous case: each step adds the value staged in */
/* t1..t4, restages the previously loaded a-register and reloads it from */
/* the next block. */
.LL11:
FADD c1, t1, c1
prefetch [X + PREFETCHSIZE * SIZE], 0
FMOV a1, t1
LDF [X + 0 * SIZE], a1
FADD c2, t2, c2
add I, -1, I
FMOV a2, t2
LDF [X + 1 * SIZE], a2
FADD c1, t3, c1
cmp I, 0
FMOV a3, t3
LDF [X + 2 * SIZE], a3
FADD c2, t4, c2
nop
FMOV a4, t4
LDF [X + 3 * SIZE], a4
FADD c1, t1, c1
nop
FMOV a5, t1
LDF [X + 4 * SIZE], a5
FADD c2, t2, c2
nop
FMOV a6, t2
LDF [X + 5 * SIZE], a6
FADD c1, t3, c1
FMOV a7, t3
LDF [X + 6 * SIZE], a7
add X, 8 * SIZE, X
FADD c2, t4, c2
FMOV a8, t4
bg,pt %icc, .LL11
/* Delay slot: X was already advanced, so offset -1 is the block's 8th scalar. */
LDF [X - 1 * SIZE], a8
/* Drain: fold in the last loaded block; a5..a8 remain staged in t1..t4. */
.LL12:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2
FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4
FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2
FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4
/* Remainder: the low two bits of N give the leftover complex element count. */
.LL15:
and N, 3, I
cmp I, 0
ble,a,pn %icc, .LL19
nop
/* One complex element (two scalars) per iteration, staged through t1/t2. */
.LL16:
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
add I, -1, I
cmp I, 0
FADD c1, t1, c1
FADD c2, t2, c2
FMOV a1, t1
FMOV a2, t2
bg,pt %icc, .LL16
add X, 2 * SIZE, X
/* Final reduction: add the still-staged t1..t4, then merge c2 into c1. */
.LL19:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2
FADD c1, c2, c1
return %i7 + 8
clr %g0
/* Strided path: two scalars are read per element, then X advances by INCX. */
.LL50:
sra N, 2, I
cmp I, 0
ble,pn %icc, .LL55
nop
/* Prime the pipeline with four strided complex elements. */
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
add X, INCX, X
LDF [X + 0 * SIZE], a3
LDF [X + 1 * SIZE], a4
add X, INCX, X
LDF [X + 0 * SIZE], a5
LDF [X + 1 * SIZE], a6
add X, INCX, X
add I, -1, I
LDF [X + 0 * SIZE], a7
cmp I, 0
LDF [X + 1 * SIZE], a8
ble,pt %icc, .LL52
add X, INCX, X
/* Steady-state loop, strided case. */
.LL51:
FADD c1, t1, c1
add I, -1, I
FMOV a1, t1
LDF [X + 0 * SIZE], a1
FADD c2, t2, c2
cmp I, 0
FMOV a2, t2
LDF [X + 1 * SIZE], a2
add X, INCX, X
FADD c1, t3, c1
FMOV a3, t3
LDF [X + 0 * SIZE], a3
FADD c2, t4, c2
FMOV a4, t4
LDF [X + 1 * SIZE], a4
add X, INCX, X
FADD c1, t1, c1
FMOV a5, t1
LDF [X + 0 * SIZE], a5
FADD c2, t2, c2
FMOV a6, t2
LDF [X + 1 * SIZE], a6
add X, INCX, X
FADD c1, t3, c1
FMOV a7, t3
LDF [X + 0 * SIZE], a7
FADD c2, t4, c2
FMOV a8, t4
LDF [X + 1 * SIZE], a8
bg,pt %icc, .LL51
add X, INCX, X
/* Drain the last loaded block (strided case). */
.LL52:
FADD c1, t1, c1
FMOV a1, t1
FADD c2, t2, c2
FMOV a2, t2
FADD c1, t3, c1
FMOV a3, t3
FADD c2, t4, c2
FMOV a4, t4
FADD c1, t1, c1
FMOV a5, t1
FADD c2, t2, c2
FMOV a6, t2
FADD c1, t3, c1
FMOV a7, t3
FADD c2, t4, c2
FMOV a8, t4
/* Remainder for the strided case. */
.LL55:
and N, 3, I
cmp I, 0
ble,a,pn %icc, .LL59
nop
.LL56:
LDF [X + 0 * SIZE], a1
LDF [X + 1 * SIZE], a2
FADD c1, t1, c1
FADD c2, t2, c2
add I, -1, I
FMOV a1, t1
FMOV a2, t2
cmp I, 0
bg,pt %icc, .LL56
add X, INCX, X
/* Final reduction for the strided path; the result is left in c1. */
.LL59:
FADD c1, t1, c1
FADD c2, t2, c2
FADD c1, t3, c1
FADD c2, t4, c2
FADD c1, c2, c1
return %i7 + 8
clr %o0
EPILOGUE

View File

@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c

207
kernel/x86/sum.S Normal file
View File

@ -0,0 +1,207 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* x86 (32-bit) x87 kernel: plain (signed) sum of M elements of X taken with */
/* stride INCX. The result is left on top of the x87 stack. */
#define ASSEMBLER
#include "common.h"
/* STACK covers the two registers pushed in the prologue (2 * 4 bytes); the */
/* arguments are then read at 4/8/12 + STACK from %esp. */
#define STACK 8
#define ARGS 0
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
/* Register aliases: element count, vector pointer, stride, loop counter. */
#define M %edx
#define X %ecx
#define INCX %esi
#define I %eax
#include "l1param.h"
PROLOGUE
pushl %esi
pushl %ebx
PROFCODE
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
EMMS
#endif
movl STACK_M, M
movl STACK_X, X
movl STACK_INCX, INCX
/* With F_INTERFACE the count and stride arrive as pointers: dereference them. */
#ifdef F_INTERFACE
movl (M), M
movl (INCX), INCX
#endif
/* Push 0.0: this is the value returned on the early exits below. */
fldz
testl M, M
jle .L999
testl INCX, INCX
jle .L999
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes; confirm). */
sall $BASE_SHIFT, INCX
/* Three more zeros: st(0)..st(3) now hold four independent partial sums. */
fldz
fldz
fldz
/* Stride is not one element: use the strided path at .L40. */
cmpl $SIZE, INCX
jne .L40
/* I = M / 8 unrolled iterations. */
movl M, I
sarl $3, I
jle .L20
ALIGN_4
/* Contiguous main loop: load four values on top of the four accumulators, */
/* then each faddp adds st(0) into one accumulator and pops. Done twice per */
/* iteration, i.e. eight elements. */
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)
addl $8 * SIZE, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decl I
jg .L10
ALIGN_4
/* Contiguous remainder: M & 7 leftover elements, one per iteration. */
.L20:
movl M, I
andl $7, I
jle .L998
ALIGN_4
.L21:
FLD (X)
faddp %st,%st(1)
addl $1 * SIZE, X
decl I
jg .L21
jmp .L998
ALIGN_4
/* Strided path: same scheme, advancing X by INCX after every load. */
.L40:
movl M, I
sarl $3, I
jle .L60
ALIGN_4
.L50:
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
FLD (X)
addl INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decl I
jg .L50
ALIGN_4
/* Strided remainder. */
.L60:
movl M, I
andl $7, I
jle .L998
ALIGN_4
.L61:
FLD (X)
addl INCX, X
faddp %st,%st(1)
decl I
jg .L61
ALIGN_4
/* Collapse the four partial sums into a single value in st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4
/* Restore the saved registers; exactly one value remains on the x87 stack. */
.L999:
popl %ebx
popl %esi
ret
EPILOGUE

208
kernel/x86/zsum.S Normal file
View File

@ -0,0 +1,208 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* x86 (32-bit) x87 kernel: sum of the real and imaginary parts of M complex */
/* elements of X taken with stride INCX. The result is left on top of the */
/* x87 stack. */
#define ASSEMBLER
#include "common.h"
/* STACK covers the two registers pushed in the prologue (2 * 4 bytes); the */
/* arguments are then read at 4/8/12 + STACK from %esp. */
#define STACK 8
#define ARGS 0
#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
/* Register aliases: element count, vector pointer, stride, loop counter. */
#define M %edx
#define X %ecx
#define INCX %esi
#define I %eax
#include "l1param.h"
PROLOGUE
pushl %esi
pushl %ebx
PROFCODE
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
EMMS
#endif
movl STACK_M, M
movl STACK_X, X
movl STACK_INCX, INCX
/* With F_INTERFACE the count and stride arrive as pointers: dereference them. */
#ifdef F_INTERFACE
movl (M), M
movl (INCX), INCX
#endif
/* Push 0.0: this is the value returned on the early exits below. */
fldz
testl M, M
jle .L999
testl INCX, INCX
jle .L999
/* Scale the stride by ZBASE_SHIFT (presumably complex elements -> bytes; confirm). */
sall $ZBASE_SHIFT, INCX
/* Three more zeros: st(0)..st(3) now hold four independent partial sums. */
fldz
fldz
fldz
/* Stride is not one complex element (2 * SIZE): use the strided path. */
cmpl $SIZE * 2, INCX
jne .L40
/* I = M / 4: each iteration consumes four complex elements (eight scalars). */
movl M, I
sarl $2, I
jle .L20
ALIGN_4
/* Contiguous main loop: load four scalars on top of the four accumulators, */
/* then each faddp adds st(0) into one accumulator and pops. */
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)
addl $8 * SIZE, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decl I
jg .L10
ALIGN_4
/* Contiguous remainder: M & 3 leftover complex elements. */
.L20:
movl M, I
andl $3, I
jle .L998
ALIGN_4
/* Load both parts of one element and add them into two of the accumulators. */
.L21:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
faddp %st,%st(3)
faddp %st,%st(1)
addl $2 * SIZE, X
decl I
jg .L21
jmp .L998
ALIGN_4
/* Strided path: two scalars are read per element, then X advances by INCX. */
.L40:
movl M, I
sarl $2, I
jle .L60
ALIGN_4
.L50:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decl I
jg .L50
ALIGN_4
/* Strided remainder. */
.L60:
movl M, I
andl $3, I
jle .L998
ALIGN_4
.L61:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addl INCX, X
faddp %st,%st(3)
faddp %st,%st(1)
decl I
jg .L61
ALIGN_4
/* Collapse the four partial sums into a single value in st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4
/* Restore the saved registers; exactly one value remains on the x87 stack. */
.L999:
popl %ebx
popl %esi
ret
EPILOGUE

View File

@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c

179
kernel/x86_64/sum.S Normal file
View File

@ -0,0 +1,179 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* x86_64 x87 kernel: plain (signed) sum of M elements of X taken with stride */
/* INCX. The result is left on top of the x87 stack. */
#define ASSEMBLER
#include "common.h"
/* Register aliases: element count, vector pointer, stride (the first three */
/* ARGn registers) and the loop counter. */
#define M ARG1
#define X ARG2
#define INCX ARG3
#define I %rax
#include "l1param.h"
PROLOGUE
PROFCODE
/* Push 0.0: this is the value returned on the early exits below. */
fldz
testq M, M
jle .L999
testq INCX, INCX
jle .L999
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes; confirm). */
salq $BASE_SHIFT, INCX
/* Three more zeros: st(0)..st(3) now hold four independent partial sums. */
fldz
fldz
fldz
/* Stride is not one element: use the strided path at .L40. */
cmpq $SIZE, INCX
jne .L40
/* I = M / 8 unrolled iterations. */
movq M, I
sarq $3, I
jle .L20
ALIGN_4
/* Contiguous main loop: load four values on top of the four accumulators, */
/* then each faddp adds st(0) into one accumulator and pops. Done twice per */
/* iteration, i.e. eight elements. */
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)
addq $8 * SIZE, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decq I
jg .L10
ALIGN_4
/* Contiguous remainder: M is overwritten with M & 7 and used as the counter. */
.L20:
andq $7, M
jle .L998
ALIGN_4
.L21:
FLD (X)
faddp %st,%st(1)
addq $1 * SIZE, X
decq M
jg .L21
jmp .L998
ALIGN_4
/* Strided path: same scheme, advancing X by INCX after every load. */
.L40:
movq M, I
sarq $3, I
jle .L60
ALIGN_4
.L50:
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
FLD (X)
addq INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decq I
jg .L50
ALIGN_4
/* Strided remainder: M is overwritten with M & 7 and used as the counter. */
.L60:
andq $7, M
jle .L998
ALIGN_4
.L61:
FLD (X)
addq INCX, X
faddp %st,%st(1)
decq M
jg .L61
ALIGN_4
/* Collapse the four partial sums into a single value in st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4
/* Exactly one value remains on the x87 stack at return. */
.L999:
ret
EPILOGUE

180
kernel/x86_64/zsum.S Normal file
View File

@ -0,0 +1,180 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* x86_64 x87 kernel: sum of the real and imaginary parts of M complex */
/* elements of X taken with stride INCX. The result is left on top of the */
/* x87 stack. */
#define ASSEMBLER
#include "common.h"
/* Register aliases: element count, vector pointer, stride (the first three */
/* ARGn registers) and the loop counter. */
#define M ARG1
#define X ARG2
#define INCX ARG3
#define I %rax
#include "l1param.h"
PROLOGUE
PROFCODE
/* Push 0.0: this is the value returned on the early exits below. */
fldz
testq M, M
jle .L999
testq INCX, INCX
jle .L999
/* Scale the stride by ZBASE_SHIFT (presumably complex elements -> bytes; confirm). */
salq $ZBASE_SHIFT, INCX
/* Three more zeros: st(0)..st(3) now hold four independent partial sums. */
fldz
fldz
fldz
/* Stride is not one complex element (2 * SIZE): use the strided path. */
cmpq $SIZE * 2, INCX
jne .L40
/* I = M / 4: each iteration consumes four complex elements (eight scalars). */
movq M, I
sarq $2, I
jle .L20
ALIGN_4
/* Contiguous main loop: load four scalars on top of the four accumulators, */
/* then each faddp adds st(0) into one accumulator and pops. */
.L10:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
FLD 2 * SIZE(X)
FLD 3 * SIZE(X)
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 4 * SIZE(X)
FLD 5 * SIZE(X)
FLD 6 * SIZE(X)
FLD 7 * SIZE(X)
addq $8 * SIZE, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decq I
jg .L10
ALIGN_4
/* Contiguous remainder: M is overwritten with M & 3 and used as the counter. */
.L20:
andq $3, M
jle .L998
ALIGN_4
/* Load both parts of one element and add them into two of the accumulators. */
.L21:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
faddp %st,%st(3)
faddp %st,%st(1)
addq $2 * SIZE, X
decq M
jg .L21
jmp .L998
ALIGN_4
/* Strided path: two scalars are read per element, then X advances by INCX. */
.L40:
movq M, I
sarq $2, I
jle .L60
ALIGN_4
.L50:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
faddp %st, %st(7)
faddp %st, %st(5)
faddp %st, %st(3)
faddp %st, %st(1)
decq I
jg .L50
ALIGN_4
/* Strided remainder: M is overwritten with M & 3 and used as the counter. */
.L60:
andq $3, M
jle .L998
ALIGN_4
.L61:
FLD 0 * SIZE(X)
FLD 1 * SIZE(X)
addq INCX, X
faddp %st,%st(3)
faddp %st,%st(1)
decq M
jg .L61
ALIGN_4
/* Collapse the four partial sums into a single value in st(0). */
.L998:
faddp %st,%st(2)
faddp %st,%st(1)
faddp %st,%st(1)
ALIGN_4
/* Exactly one value remains on the x87 stack at return. */
.L999:
ret
EPILOGUE

View File

@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = zasum.c
# ?SUM is the plain signed sum, so it must use the sum sources rather than
# the absolute-value (?ASUM) kernels listed above.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = ../arm/zaxpy.c

View File

@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c
SSUMKERNEL = ssum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = csum.c
ZSUMKERNEL = zsum.c
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c

View File

@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c

137
kernel/zarch/csum.c Normal file
View File

@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Vectorised partial sum for the contiguous complex case.
 *
 * Adds up every real and imaginary component of the first n complex
 * elements of x and returns the scalar total.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times and each
 * pass consumes 16 vector loads of 16 bytes (256 bytes). The caller is
 * responsible for that guarantee; a count of zero would make the brctg
 * loop counter wrap.
 * NOTE(review): the element arithmetic assumes FLOAT is a 4-byte float
 * (vfasb / vstef are the short-format instructions) - confirm for this build.
 */
static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t" /* v24..v31: eight independent vector accumulators */
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg  %[n],%[n],5\n\t" /* iteration count = n / 32 */
          "xgr   %%r1,%%r1\n\t"   /* r1 = byte offset into x */
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead */
          "vl  %%v16, 0(%%r1,%[x])\n\t"
          "vl  %%v17, 16(%%r1,%[x])\n\t"
          "vl  %%v18, 32(%%r1,%[x])\n\t"
          "vl  %%v19, 48(%%r1,%[x])\n\t"
          "vl  %%v20, 64(%%r1,%[x])\n\t"
          "vl  %%v21, 80(%%r1,%[x])\n\t"
          "vl  %%v22, 96(%%r1,%[x])\n\t"
          "vl  %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "vl  %%v16, 128(%%r1,%[x])\n\t"
          "vl  %%v17, 144(%%r1,%[x])\n\t"
          "vl  %%v18, 160(%%r1,%[x])\n\t"
          "vl  %%v19, 176(%%r1,%[x])\n\t"
          "vl  %%v20, 192(%%r1,%[x])\n\t"
          "vl  %%v21, 208(%%r1,%[x])\n\t"
          "vl  %%v22, 224(%%r1,%[x])\n\t"
          "vl  %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi  %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight accumulators into v24. */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add: lanes 0+1 and 2+3, then lane 2 into lane 0. */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
/*
 * Sum of the real and imaginary parts of n complex elements of x, taken
 * with a stride of inc_x complex elements.
 *
 * Returns 0.0 when n or inc_x is not positive. For unit stride, the largest
 * multiple of 32 elements is handed to csum_kernel_32 and the rest is added
 * by a scalar loop; any other stride is handled entirely by a scalar loop.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT acc = 0.0;
  BLASLONG k;    /* complex elements consumed so far */
  BLASLONG pos;  /* index of the current real part in x */
  BLASLONG step; /* distance between consecutive real parts */

  if (n <= 0 || inc_x <= 0)
    return (acc);

  if (inc_x != 1) {
    /* Strided case: purely scalar. */
    step = 2 * inc_x;
    for (k = 0, pos = 0; k < n; k++, pos += step)
      acc += x[pos] + x[pos + 1];
    return (acc);
  }

  /* Unit stride: vector kernel for the leading multiple of 32 elements. */
  k = n & -32;
  if (k > 0)
    acc = csum_kernel_32(k, x);

  /* Scalar tail; each element occupies two consecutive FLOATs. */
  for (pos = 2 * k; k < n; k++, pos += 2)
    acc += x[pos] + x[pos + 1];

  return (acc);
}

148
kernel/zarch/dsum.c Normal file
View File

@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Vectorised partial sum for the contiguous real case.
 *
 * Adds up the first n elements of x and returns the scalar total.
 *
 * n must be a positive multiple of 32: the loop runs n >> 5 times and each
 * pass consumes 16 vector loads of 16 bytes (256 bytes). The caller is
 * responsible for that guarantee; a count of zero would make the brctg
 * loop counter wrap.
 * NOTE(review): the element arithmetic assumes FLOAT is an 8-byte double
 * (vfadb / vsteg are the long-format instructions) - confirm for this build.
 */
static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t" /* v24..v31: eight independent vector accumulators */
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg  %[n],%[n],5\n\t" /* iteration count = n / 32 */
          "xgr   %%r1,%%r1\n\t"   /* r1 = byte offset into x */
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t" /* prefetch 1024 bytes ahead */
          "vl  %%v16, 0(%%r1,%[x])\n\t"
          "vl  %%v17, 16(%%r1,%[x])\n\t"
          "vl  %%v18, 32(%%r1,%[x])\n\t"
          "vl  %%v19, 48(%%r1,%[x])\n\t"
          "vl  %%v20, 64(%%r1,%[x])\n\t"
          "vl  %%v21, 80(%%r1,%[x])\n\t"
          "vl  %%v22, 96(%%r1,%[x])\n\t"
          "vl  %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "vl  %%v16, 128(%%r1,%[x])\n\t"
          "vl  %%v17, 144(%%r1,%[x])\n\t"
          "vl  %%v18, 160(%%r1,%[x])\n\t"
          "vl  %%v19, 176(%%r1,%[x])\n\t"
          "vl  %%v20, 192(%%r1,%[x])\n\t"
          "vl  %%v21, 208(%%r1,%[x])\n\t"
          "vl  %%v22, 224(%%r1,%[x])\n\t"
          "vl  %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi  %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight accumulators into v24. */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add: lane 1 is replicated and added into lane 0. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
/*
 * Sum of n elements of x taken with a stride of inc_x elements.
 *
 * Returns 0.0 when n or inc_x is not positive. For unit stride, the largest
 * multiple of 32 elements is handed to dsum_kernel_32 and the rest is added
 * by a scalar loop. For any other stride, the elements are added four at a
 * time into two alternating partial sums, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0; /* index into x */
  BLASLONG cnt = 0; /* elements consumed on the strided path */
  BLASLONG blocked; /* element count covered by the unrolled/vector part */
  FLOAT even, odd;

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    /* Unit stride: vector kernel first, then the scalar remainder. */
    blocked = n & -32;
    if (blocked > 0) {
      total = dsum_kernel_32(blocked, x);
      pos = blocked;
    }
    for (; pos < n; pos++)
      total += x[pos];
    return total;
  }

  /* Strided case: four elements per pass, alternating between two sums. */
  even = 0.0;
  odd = 0.0;
  blocked = n & -4;
  for (; cnt < blocked; cnt += 4, pos += inc_x * 4) {
    even += x[pos];
    odd += x[pos + inc_x];
    even += x[pos + 2 * inc_x];
    odd += x[pos + 3 * inc_x];
  }
  total = even + odd;

  /* Up to three leftover elements. */
  for (; cnt < n; cnt++, pos += inc_x)
    total += x[pos];

  return total;
}

151
kernel/zarch/ssum.c Normal file
View File

@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Vectorised sum of n contiguous 4-byte floating-point values starting at x.
 *
 * Each loop iteration reads 256 bytes (64 elements) through sixteen 16-byte
 * vector loads and accumulates them into eight independent vector registers
 * (v24-v31).  After the loop the eight accumulators are added together and
 * the four lanes of the result are folded into lane 0, which is stored to
 * the return value.
 *
 * n must be a positive multiple of 64: the iteration count is n >> 6 and the
 * loop body runs before the count is tested, so n == 0 is not handled here.
 */
static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* zero the eight vector accumulators */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n = number of 64-element iterations; r1 = byte offset into x */
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* prefetch 1024 bytes ahead of the current offset */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          /* advance 256 bytes and loop while the count is non-zero */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* combine the eight accumulators into v24 */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* horizontal add: lanes 1 and 3 onto lanes 0 and 2 ... */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* ... then lane 2 onto lane 0 */
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* store lane 0; the name must match the [sum] output operand */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
/*
 * Plain sum of n elements of x read with stride inc_x.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple of 64 elements is handed to ssum_kernel_64 and the remainder is
 * added one element at a time.  For any other stride the elements are
 * consumed four per pass into two alternating accumulators, which are added
 * together before the leftover elements are folded in.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0; /* current index into x */

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    const BLASLONG bulk = n & -64;

    if (bulk > 0) {
      total = ssum_kernel_64(bulk, x);
      pos = bulk;
    }
    for (; pos < n; pos++)
      total += x[pos];
  } else {
    const BLASLONG bulk = n & -4;
    BLASLONG seen; /* number of elements consumed so far */
    FLOAT acc_a = 0.0;
    FLOAT acc_b = 0.0;

    /* Four elements per pass, alternating between the two accumulators. */
    for (seen = 0; seen < bulk; seen += 4, pos += inc_x * 4) {
      acc_a += x[pos];
      acc_b += x[pos + inc_x];
      acc_a += x[pos + 2 * inc_x];
      acc_b += x[pos + 3 * inc_x];
    }
    total = acc_a + acc_b;

    /* Up to three elements are left over. */
    for (; seen < n; seen++, pos += inc_x)
      total += x[pos];
  }
  return total;
}

136
kernel/zarch/zsum.c Normal file
View File

@ -0,0 +1,136 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Vectorised sum of n contiguous complex values (2 * n 8-byte components)
 * starting at x.  Real and imaginary components are all added into a single
 * scalar result.
 *
 * Each loop iteration reads 256 bytes (16 complex elements) through sixteen
 * 16-byte vector loads and accumulates them into eight independent vector
 * registers (v24-v31).  After the loop the eight accumulators are added
 * together and lane 1 is folded onto lane 0, which is stored to the return
 * value.
 *
 * n must be a positive multiple of 16: the iteration count is n >> 4 and the
 * loop body runs before the count is tested, so n == 0 is not handled here.
 */
static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__(/* zero the eight vector accumulators */
          "vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          /* n = number of 16-element iterations; r1 = byte offset into x */
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          /* prefetch 1024 bytes ahead of the current offset */
          "pfd 1, 1024(%%r1,%[x])\n\t"
          /* first 128 bytes */
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* second 128 bytes */
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          /* advance 256 bytes and loop while the count is non-zero */
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* combine the eight accumulators into v24 */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* horizontal add: lane 1 onto lane 0 */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* store lane 0; the name must match the [sum] output operand */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
/*
 * Sum of the real and imaginary components of n complex elements of x,
 * where consecutive elements are inc_x complex slots (2 * inc_x FLOATs)
 * apart.
 *
 * Returns 0 when n <= 0 or inc_x <= 0.  For unit stride the largest
 * multiple of 16 elements is handed to zsum_kernel_16; every remaining
 * element (or every element, for other strides) contributes
 * (real + imaginary) to the running total.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* complex elements consumed so far */
  BLASLONG off = 0;  /* FLOAT index of the current element's real part */
  BLASLONG step;

  if (n <= 0 || inc_x <= 0)
    return total;

  if (inc_x == 1) {
    const BLASLONG bulk = n & -16;

    if (bulk > 0) {
      total = zsum_kernel_16(bulk, x);
      done = bulk;
      off = 2 * bulk;
    }
  }

  /* Scalar loop: the tail for unit stride, everything otherwise. */
  step = 2 * inc_x;
  for (; done < n; done++, off += step)
    total += x[off] + x[off + 1];

  return total;
}