Merge pull request #2072 from martin-frbg/sum
Add (C)BLAS extension ?sum
This commit is contained in:
commit
ccfb7ead15
5
cblas.h
5
cblas.h
|
@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
|
||||||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
||||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
||||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
||||||
|
|
|
@ -107,6 +107,12 @@ macro(SetDefaultL1)
|
||||||
set(DAXPBYKERNEL ../arm/axpby.c)
|
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||||
|
set(SSUMKERNEL sum.S)
|
||||||
|
set(DSUMKERNEL sum.S)
|
||||||
|
set(CSUMKERNEL zsum.S)
|
||||||
|
set(ZSUMKERNEL zsum.S)
|
||||||
|
set(QSUMKERNEL sum.S)
|
||||||
|
set(XSUMKERNEL zsum.S)
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
||||||
macro(SetDefaultL2)
|
macro(SetDefaultL2)
|
||||||
|
@ -162,4 +168,4 @@ macro(SetDefaultL3)
|
||||||
set(DGEADD_KERNEL ../generic/geadd.c)
|
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define CDOTC_K cdotc_k
|
#define CDOTC_K cdotc_k
|
||||||
#define CNRM2_K cnrm2_k
|
#define CNRM2_K cnrm2_k
|
||||||
#define CSCAL_K cscal_k
|
#define CSCAL_K cscal_k
|
||||||
|
#define CSUM_K csum_k
|
||||||
#define CSWAP_K cswap_k
|
#define CSWAP_K cswap_k
|
||||||
#define CROT_K csrot_k
|
#define CROT_K csrot_k
|
||||||
|
|
||||||
|
@ -249,6 +250,7 @@
|
||||||
#define CDOTC_K gotoblas -> cdotc_k
|
#define CDOTC_K gotoblas -> cdotc_k
|
||||||
#define CNRM2_K gotoblas -> cnrm2_k
|
#define CNRM2_K gotoblas -> cnrm2_k
|
||||||
#define CSCAL_K gotoblas -> cscal_k
|
#define CSCAL_K gotoblas -> cscal_k
|
||||||
|
#define CSUM_K gotoblas -> csum_k
|
||||||
#define CSWAP_K gotoblas -> cswap_k
|
#define CSWAP_K gotoblas -> cswap_k
|
||||||
#define CROT_K gotoblas -> csrot_k
|
#define CROT_K gotoblas -> csrot_k
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define DDOTC_K ddot_k
|
#define DDOTC_K ddot_k
|
||||||
#define DNRM2_K dnrm2_k
|
#define DNRM2_K dnrm2_k
|
||||||
#define DSCAL_K dscal_k
|
#define DSCAL_K dscal_k
|
||||||
|
#define DSUM_K dsum_k
|
||||||
#define DSWAP_K dswap_k
|
#define DSWAP_K dswap_k
|
||||||
#define DROT_K drot_k
|
#define DROT_K drot_k
|
||||||
|
|
||||||
|
@ -174,6 +175,7 @@
|
||||||
#define DDOTC_K gotoblas -> ddot_k
|
#define DDOTC_K gotoblas -> ddot_k
|
||||||
#define DNRM2_K gotoblas -> dnrm2_k
|
#define DNRM2_K gotoblas -> dnrm2_k
|
||||||
#define DSCAL_K gotoblas -> dscal_k
|
#define DSCAL_K gotoblas -> dscal_k
|
||||||
|
#define DSUM_K gotoblas -> dsum_k
|
||||||
#define DSWAP_K gotoblas -> dswap_k
|
#define DSWAP_K gotoblas -> dswap_k
|
||||||
#define DROT_K gotoblas -> drot_k
|
#define DROT_K gotoblas -> drot_k
|
||||||
|
|
||||||
|
|
|
@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
|
||||||
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
||||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
||||||
|
|
||||||
|
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
|
||||||
|
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
|
||||||
|
double BLASFUNC(dsum) (blasint *, double *, blasint *);
|
||||||
|
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
|
||||||
|
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
|
||||||
|
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
|
||||||
|
|
||||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
||||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
||||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
||||||
|
|
|
@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
|
||||||
double zasum_k (BLASLONG, double *, BLASLONG);
|
double zasum_k (BLASLONG, double *, BLASLONG);
|
||||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
|
float ssum_k (BLASLONG, float *, BLASLONG);
|
||||||
|
double dsum_k (BLASLONG, double *, BLASLONG);
|
||||||
|
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
float csum_k (BLASLONG, float *, BLASLONG);
|
||||||
|
double zsum_k (BLASLONG, double *, BLASLONG);
|
||||||
|
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
float samax_k (BLASLONG, float *, BLASLONG);
|
float samax_k (BLASLONG, float *, BLASLONG);
|
||||||
double damax_k (BLASLONG, double *, BLASLONG);
|
double damax_k (BLASLONG, double *, BLASLONG);
|
||||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
|
@ -66,6 +66,7 @@
|
||||||
#define DOTC_K QDOTC_K
|
#define DOTC_K QDOTC_K
|
||||||
#define NRM2_K QNRM2_K
|
#define NRM2_K QNRM2_K
|
||||||
#define SCAL_K QSCAL_K
|
#define SCAL_K QSCAL_K
|
||||||
|
#define SUM_K QSUM_K
|
||||||
#define SWAP_K QSWAP_K
|
#define SWAP_K QSWAP_K
|
||||||
#define ROT_K QROT_K
|
#define ROT_K QROT_K
|
||||||
|
|
||||||
|
@ -356,6 +357,7 @@
|
||||||
#define DOTC_K DDOTC_K
|
#define DOTC_K DDOTC_K
|
||||||
#define NRM2_K DNRM2_K
|
#define NRM2_K DNRM2_K
|
||||||
#define SCAL_K DSCAL_K
|
#define SCAL_K DSCAL_K
|
||||||
|
#define SUM_K DSUM_K
|
||||||
#define SWAP_K DSWAP_K
|
#define SWAP_K DSWAP_K
|
||||||
#define ROT_K DROT_K
|
#define ROT_K DROT_K
|
||||||
|
|
||||||
|
@ -658,6 +660,7 @@
|
||||||
#define DOTC_K SDOTC_K
|
#define DOTC_K SDOTC_K
|
||||||
#define NRM2_K SNRM2_K
|
#define NRM2_K SNRM2_K
|
||||||
#define SCAL_K SSCAL_K
|
#define SCAL_K SSCAL_K
|
||||||
|
#define SUM_K SSUM_K
|
||||||
#define SWAP_K SSWAP_K
|
#define SWAP_K SSWAP_K
|
||||||
#define ROT_K SROT_K
|
#define ROT_K SROT_K
|
||||||
|
|
||||||
|
@ -962,6 +965,7 @@
|
||||||
#define DOTC_K XDOTC_K
|
#define DOTC_K XDOTC_K
|
||||||
#define NRM2_K XNRM2_K
|
#define NRM2_K XNRM2_K
|
||||||
#define SCAL_K XSCAL_K
|
#define SCAL_K XSCAL_K
|
||||||
|
#define SUM_K XSUM_K
|
||||||
#define SWAP_K XSWAP_K
|
#define SWAP_K XSWAP_K
|
||||||
#define ROT_K XROT_K
|
#define ROT_K XROT_K
|
||||||
|
|
||||||
|
@ -1363,6 +1367,7 @@
|
||||||
#define DOTC_K ZDOTC_K
|
#define DOTC_K ZDOTC_K
|
||||||
#define NRM2_K ZNRM2_K
|
#define NRM2_K ZNRM2_K
|
||||||
#define SCAL_K ZSCAL_K
|
#define SCAL_K ZSCAL_K
|
||||||
|
#define SUM_K ZSUM_K
|
||||||
#define SWAP_K ZSWAP_K
|
#define SWAP_K ZSWAP_K
|
||||||
#define ROT_K ZROT_K
|
#define ROT_K ZROT_K
|
||||||
|
|
||||||
|
@ -1785,6 +1790,7 @@
|
||||||
#define DOTC_K CDOTC_K
|
#define DOTC_K CDOTC_K
|
||||||
#define NRM2_K CNRM2_K
|
#define NRM2_K CNRM2_K
|
||||||
#define SCAL_K CSCAL_K
|
#define SCAL_K CSCAL_K
|
||||||
|
#define SUM_K CSUM_K
|
||||||
#define SWAP_K CSWAP_K
|
#define SWAP_K CSWAP_K
|
||||||
#define ROT_K CROT_K
|
#define ROT_K CROT_K
|
||||||
|
|
||||||
|
|
|
@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
|
||||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||||
|
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
||||||
|
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
||||||
|
|
||||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
||||||
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
double (*zsum_k) (BLASLONG, double *, BLASLONG);
|
||||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
|
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define QDOTC_K qdot_k
|
#define QDOTC_K qdot_k
|
||||||
#define QNRM2_K qnrm2_k
|
#define QNRM2_K qnrm2_k
|
||||||
#define QSCAL_K qscal_k
|
#define QSCAL_K qscal_k
|
||||||
|
#define QSUM_K qsum_k
|
||||||
#define QSWAP_K qswap_k
|
#define QSWAP_K qswap_k
|
||||||
#define QROT_K qrot_k
|
#define QROT_K qrot_k
|
||||||
|
|
||||||
|
@ -161,6 +162,7 @@
|
||||||
#define QDOTC_K gotoblas -> qdot_k
|
#define QDOTC_K gotoblas -> qdot_k
|
||||||
#define QNRM2_K gotoblas -> qnrm2_k
|
#define QNRM2_K gotoblas -> qnrm2_k
|
||||||
#define QSCAL_K gotoblas -> qscal_k
|
#define QSCAL_K gotoblas -> qscal_k
|
||||||
|
#define QSUM_K gotoblas -> qsum_k
|
||||||
#define QSWAP_K gotoblas -> qswap_k
|
#define QSWAP_K gotoblas -> qswap_k
|
||||||
#define QROT_K gotoblas -> qrot_k
|
#define QROT_K gotoblas -> qrot_k
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#define ISMAX_K ismax_k
|
#define ISMAX_K ismax_k
|
||||||
#define ISMIN_K ismin_k
|
#define ISMIN_K ismin_k
|
||||||
#define SASUM_K sasum_k
|
#define SASUM_K sasum_k
|
||||||
|
#define SSUM_K ssum_k
|
||||||
#define SAXPYU_K saxpy_k
|
#define SAXPYU_K saxpy_k
|
||||||
#define SAXPYC_K saxpy_k
|
#define SAXPYC_K saxpy_k
|
||||||
#define SCOPY_K scopy_k
|
#define SCOPY_K scopy_k
|
||||||
|
@ -170,6 +171,7 @@
|
||||||
#define ISMAX_K gotoblas -> ismax_k
|
#define ISMAX_K gotoblas -> ismax_k
|
||||||
#define ISMIN_K gotoblas -> ismin_k
|
#define ISMIN_K gotoblas -> ismin_k
|
||||||
#define SASUM_K gotoblas -> sasum_k
|
#define SASUM_K gotoblas -> sasum_k
|
||||||
|
#define SSUM_K gotoblas -> ssum_k
|
||||||
#define SAXPYU_K gotoblas -> saxpy_k
|
#define SAXPYU_K gotoblas -> saxpy_k
|
||||||
#define SAXPYC_K gotoblas -> saxpy_k
|
#define SAXPYC_K gotoblas -> saxpy_k
|
||||||
#define SCOPY_K gotoblas -> scopy_k
|
#define SCOPY_K gotoblas -> scopy_k
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define XDOTC_K xdotc_k
|
#define XDOTC_K xdotc_k
|
||||||
#define XNRM2_K xnrm2_k
|
#define XNRM2_K xnrm2_k
|
||||||
#define XSCAL_K xscal_k
|
#define XSCAL_K xscal_k
|
||||||
|
#define XSUM_K xsum_k
|
||||||
#define XSWAP_K xswap_k
|
#define XSWAP_K xswap_k
|
||||||
#define XROT_K xqrot_k
|
#define XROT_K xqrot_k
|
||||||
|
|
||||||
|
@ -227,6 +228,7 @@
|
||||||
#define XDOTC_K gotoblas -> xdotc_k
|
#define XDOTC_K gotoblas -> xdotc_k
|
||||||
#define XNRM2_K gotoblas -> xnrm2_k
|
#define XNRM2_K gotoblas -> xnrm2_k
|
||||||
#define XSCAL_K gotoblas -> xscal_k
|
#define XSCAL_K gotoblas -> xscal_k
|
||||||
|
#define XSUM_K gotoblas -> xsum_k
|
||||||
#define XSWAP_K gotoblas -> xswap_k
|
#define XSWAP_K gotoblas -> xswap_k
|
||||||
#define XROT_K gotoblas -> xqrot_k
|
#define XROT_K gotoblas -> xqrot_k
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define ZDOTC_K zdotc_k
|
#define ZDOTC_K zdotc_k
|
||||||
#define ZNRM2_K znrm2_k
|
#define ZNRM2_K znrm2_k
|
||||||
#define ZSCAL_K zscal_k
|
#define ZSCAL_K zscal_k
|
||||||
|
#define ZSUM_K zsum_k
|
||||||
#define ZSWAP_K zswap_k
|
#define ZSWAP_K zswap_k
|
||||||
#define ZROT_K zdrot_k
|
#define ZROT_K zdrot_k
|
||||||
|
|
||||||
|
@ -249,6 +250,7 @@
|
||||||
#define ZDOTC_K gotoblas -> zdotc_k
|
#define ZDOTC_K gotoblas -> zdotc_k
|
||||||
#define ZNRM2_K gotoblas -> znrm2_k
|
#define ZNRM2_K gotoblas -> znrm2_k
|
||||||
#define ZSCAL_K gotoblas -> zscal_k
|
#define ZSCAL_K gotoblas -> zscal_k
|
||||||
|
#define ZSUM_K gotoblas -> zsum_k
|
||||||
#define ZSWAP_K gotoblas -> zswap_k
|
#define ZSWAP_K gotoblas -> zswap_k
|
||||||
#define ZROT_K gotoblas -> zdrot_k
|
#define ZROT_K gotoblas -> zdrot_k
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES
|
||||||
rotm.c rotmg.c # N.B. these do not have complex counterparts
|
rotm.c rotmg.c # N.B. these do not have complex counterparts
|
||||||
rot.c
|
rot.c
|
||||||
asum.c
|
asum.c
|
||||||
|
sum.c
|
||||||
)
|
)
|
||||||
|
|
||||||
# these will have 'z' prepended for the complex version
|
# these will have 'z' prepended for the complex version
|
||||||
|
@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||||
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||||
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||||
|
GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX")
|
||||||
endif ()
|
endif ()
|
||||||
if (${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "ZCOMPLEX")
|
||||||
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
|
GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX")
|
||||||
|
@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES})
|
||||||
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||||
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||||
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||||
|
GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX")
|
||||||
endif ()
|
endif ()
|
||||||
endforeach ()
|
endforeach ()
|
||||||
|
|
||||||
|
|
|
@ -25,7 +25,7 @@ SBLAS1OBJS = \
|
||||||
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
|
saxpy.$(SUFFIX) sswap.$(SUFFIX) \
|
||||||
scopy.$(SUFFIX) sscal.$(SUFFIX) \
|
scopy.$(SUFFIX) sscal.$(SUFFIX) \
|
||||||
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
|
sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \
|
||||||
sasum.$(SUFFIX) snrm2.$(SUFFIX) \
|
sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \
|
||||||
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
|
smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \
|
||||||
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
|
smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \
|
||||||
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
|
srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \
|
||||||
|
@ -51,7 +51,7 @@ DBLAS1OBJS = \
|
||||||
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
daxpy.$(SUFFIX) dswap.$(SUFFIX) \
|
||||||
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
|
dcopy.$(SUFFIX) dscal.$(SUFFIX) \
|
||||||
ddot.$(SUFFIX) \
|
ddot.$(SUFFIX) \
|
||||||
dasum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \
|
||||||
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
|
dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \
|
||||||
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
|
dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \
|
||||||
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
|
drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \
|
||||||
|
@ -76,7 +76,7 @@ CBLAS1OBJS = \
|
||||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||||
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
|
ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \
|
||||||
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
|
cdotc.$(SUFFIX) cdotu.$(SUFFIX) \
|
||||||
scasum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \
|
||||||
scamax.$(SUFFIX) icamax.$(SUFFIX) \
|
scamax.$(SUFFIX) icamax.$(SUFFIX) \
|
||||||
scamin.$(SUFFIX) icamin.$(SUFFIX) \
|
scamin.$(SUFFIX) icamin.$(SUFFIX) \
|
||||||
csrot.$(SUFFIX) crotg.$(SUFFIX) \
|
csrot.$(SUFFIX) crotg.$(SUFFIX) \
|
||||||
|
@ -105,7 +105,7 @@ ZBLAS1OBJS = \
|
||||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||||
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
|
zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \
|
||||||
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
|
zdotc.$(SUFFIX) zdotu.$(SUFFIX) \
|
||||||
dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \
|
||||||
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
|
dzamax.$(SUFFIX) izamax.$(SUFFIX) \
|
||||||
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
|
dzamin.$(SUFFIX) izamin.$(SUFFIX) \
|
||||||
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
|
zdrot.$(SUFFIX) zrotg.$(SUFFIX) \
|
||||||
|
@ -146,7 +146,7 @@ QBLAS1OBJS = \
|
||||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||||
qdot.$(SUFFIX) \
|
qdot.$(SUFFIX) \
|
||||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||||
|
@ -168,7 +168,7 @@ XBLAS1OBJS = \
|
||||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||||
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
|
xdotc.$(SUFFIX) xdotu.$(SUFFIX) \
|
||||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||||
|
@ -203,7 +203,7 @@ ifdef QUAD_PRECISION
|
||||||
QBLAS1OBJS = \
|
QBLAS1OBJS = \
|
||||||
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
qaxpy.$(SUFFIX) qswap.$(SUFFIX) \
|
||||||
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
qcopy.$(SUFFIX) qscal.$(SUFFIX) \
|
||||||
qasum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \
|
||||||
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \
|
||||||
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \
|
||||||
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \
|
||||||
|
@ -224,7 +224,7 @@ QBLAS3OBJS = \
|
||||||
XBLAS1OBJS = \
|
XBLAS1OBJS = \
|
||||||
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \
|
||||||
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \
|
||||||
qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \
|
||||||
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
qxamax.$(SUFFIX) ixamax.$(SUFFIX) \
|
||||||
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
qxamin.$(SUFFIX) ixamin.$(SUFFIX) \
|
||||||
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
xqrot.$(SUFFIX) xrotg.$(SUFFIX) \
|
||||||
|
@ -264,7 +264,7 @@ CSBLAS1OBJS = \
|
||||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX)
|
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||||
|
|
||||||
CSBLAS2OBJS = \
|
CSBLAS2OBJS = \
|
||||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||||
|
@ -282,7 +282,7 @@ CDBLAS1OBJS = \
|
||||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX)
|
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||||
|
|
||||||
CDBLAS2OBJS = \
|
CDBLAS2OBJS = \
|
||||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||||
|
@ -303,7 +303,7 @@ CCBLAS1OBJS = \
|
||||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||||
cblas_caxpby.$(SUFFIX) \
|
cblas_caxpby.$(SUFFIX) \
|
||||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX)
|
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||||
|
|
||||||
CCBLAS2OBJS = \
|
CCBLAS2OBJS = \
|
||||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||||
|
@ -330,7 +330,7 @@ CZBLAS1OBJS = \
|
||||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||||
cblas_zaxpby.$(SUFFIX) \
|
cblas_zaxpby.$(SUFFIX) \
|
||||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX)
|
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
CZBLAS2OBJS = \
|
CZBLAS2OBJS = \
|
||||||
|
@ -565,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c
|
||||||
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
|
qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
|
snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c
|
||||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
@ -1412,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c
|
||||||
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
|
cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
@ -1419,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
|
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
|
cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
|
@ -0,0 +1,97 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
#ifdef FUNCTION_PROFILE
|
||||||
|
#include "functable.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CBLAS
|
||||||
|
|
||||||
|
/*
 * Fortran-style entry point for the ?sum extension (all arguments are passed
 * by reference).
 *
 *   N     pointer to the number of elements to add
 *   x     vector to be summed
 *   INCX  pointer to the increment handed to the kernel
 *
 * Returns 0 when *N <= 0; otherwise returns the value produced by
 * SUM_K(n, x, incx), cast to FLOATRET.
 *
 * NOTE(review): incx is forwarded unchecked, so the result for a zero or
 * negative increment is whatever the selected SUM_K kernel does -- confirm
 * against the kernels.
 * PRINT_DEBUG_NAME, IDEBUG_START/END and FUNCTION_PROFILE_START/END are
 * macros defined outside this file (presumably debug/profiling hooks); their
 * order is kept exactly as written because their expansions are not visible
 * here.
 */
FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||||
|
|
||||||
|
/* Read the by-reference arguments once into BLASLONG locals. */
BLASLONG n = *N;
|
||||||
|
BLASLONG incx = *INCX;
|
||||||
|
FLOATRET ret;
|
||||||
|
|
||||||
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
/* Nothing to add: return 0 without calling the kernel. */
if (n <= 0) return 0;
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
|
/* The actual summation is delegated to the SUM_K kernel. */
ret = (FLOATRET)SUM_K(n, x, incx);
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||||
|
|
||||||
|
IDEBUG_END;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
/*
 * CBLAS-style entry point for the ?sum extension (arguments passed by value).
 *
 * With COMPLEX defined the vector is received as void *vx and cast to
 * FLOAT *; otherwise it is received directly as FLOAT *x.
 *
 *   n     number of elements to add
 *   x/vx  vector to be summed
 *   incx  increment handed to the kernel
 *
 * Returns 0 when n <= 0; otherwise returns SUM_K(n, x, incx).
 *
 * NOTE(review): incx is forwarded unchecked; behaviour for incx <= 0 depends
 * on the selected SUM_K kernel -- confirm.
 * PRINT_DEBUG_CNAME, IDEBUG_START/END and FUNCTION_PROFILE_START/END are
 * macros defined outside this file (presumably debug/profiling hooks).
 */
#ifdef COMPLEX
|
||||||
|
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||||
|
/* View the untyped complex vector as an array of FLOAT for the kernel. */
FLOAT *x = (FLOAT*) vx;
|
||||||
|
#else
|
||||||
|
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FLOAT ret;
|
||||||
|
|
||||||
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
||||||
|
/* Nothing to add: return 0 without calling the kernel. */
if (n <= 0) return 0;
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
|
/* The actual summation is delegated to the SUM_K kernel. */
ret = SUM_K(n, x, incx);
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_END(COMPSIZE, n, n);
|
||||||
|
|
||||||
|
IDEBUG_END;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type})
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type})
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type})
|
||||||
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type})
|
||||||
|
|
||||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
|
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type})
|
||||||
|
|
|
@ -340,6 +340,32 @@ ifndef XSCALKERNEL
|
||||||
XSCALKERNEL = zscal.S
|
XSCALKERNEL = zscal.S
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
### SUM ###
|
||||||
|
|
||||||
|
# Default kernel sources for the SUM routines.  Every assignment is guarded by
# ifndef, so a value that is already defined when this file is read is kept.
# The non-complex variants (S, D, Q) default to sum.S and the complex variants
# (C, Z, X) default to zsum.S.
ifndef SSUMKERNEL
|
||||||
|
SSUMKERNEL = sum.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef DSUMKERNEL
|
||||||
|
DSUMKERNEL = sum.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef CSUMKERNEL
|
||||||
|
CSUMKERNEL = zsum.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef ZSUMKERNEL
|
||||||
|
ZSUMKERNEL = zsum.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef QSUMKERNEL
|
||||||
|
QSUMKERNEL = sum.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef XSUMKERNEL
|
||||||
|
XSUMKERNEL = zsum.S
|
||||||
|
endif
|
||||||
|
|
||||||
### SWAP ###
|
### SWAP ###
|
||||||
|
|
||||||
ifndef SSWAPKERNEL
|
ifndef SSWAPKERNEL
|
||||||
|
@ -453,7 +479,7 @@ endif
|
||||||
SBLASOBJS += \
|
SBLASOBJS += \
|
||||||
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
|
samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
|
isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \
|
||||||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
|
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
|
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \
|
||||||
saxpby_k$(TSUFFIX).$(SUFFIX)
|
saxpby_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
@ -463,31 +489,32 @@ DBLASOBJS += \
|
||||||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
|
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
|
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
|
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \
|
||||||
daxpby_k$(TSUFFIX).$(SUFFIX)
|
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
QBLASOBJS += \
|
QBLASOBJS += \
|
||||||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
|
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
|
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
|
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX)
|
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \
|
||||||
|
qsum_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CBLASOBJS += \
|
CBLASOBJS += \
|
||||||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
|
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
|
casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \
|
||||||
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
|
cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX)
|
cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZBLASOBJS += \
|
ZBLASOBJS += \
|
||||||
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
|
zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
|
zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||||
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
|
zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX)
|
zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
XBLASOBJS += \
|
XBLASOBJS += \
|
||||||
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
|
xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \
|
||||||
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
|
xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \
|
||||||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
||||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX)
|
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
### AMAX ###
|
### AMAX ###
|
||||||
|
|
||||||
|
@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
||||||
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
|
$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL)
|
||||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
|
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@
|
||||||
|
|
||||||
|
### ASUM ###
|
||||||
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
|
$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL)
|
||||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||||
|
|
||||||
|
@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
||||||
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
|
$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL)
|
||||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||||
|
|
||||||
|
### SUM ###
|
||||||
|
# One rule per SUM kernel object.  Each object is compiled from the source
# named by the matching ?SUMKERNEL variable; the variant is selected purely by
# preprocessor flags on the command line:
#   ssum: -UCOMPLEX -UDOUBLE      csum: -DCOMPLEX -UDOUBLE
#   dsum: -UCOMPLEX -DDOUBLE      zsum: -DCOMPLEX -DDOUBLE
#   qsum: -UCOMPLEX -DXDOUBLE     xsum: -DCOMPLEX -DXDOUBLE
$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@
|
||||||
|
|
||||||
|
$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL)
|
||||||
|
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||||
|
|
||||||
|
### AXPY ###
|
||||||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
|
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
|
||||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,206 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "version.h"
|
||||||
|
|
||||||
|
/* Distance used by the "ldl $31" in the main loop; it is multiplied by
   2 * SIZE at the use site. */
#define PREFETCHSIZE 88
|
||||||
|
|
||||||
|
/* Integer registers: N is the element count, X the current element address,
   INCX the stride operand given to SXADDQ, I the loop counter.
   NOTE(review): $16-$18 are presumably the incoming argument registers --
   confirm against the Alpha calling convention. */
#define N $16
|
||||||
|
#define X $17
|
||||||
|
#define INCX $18
|
||||||
|
#define I $19
|
||||||
|
|
||||||
|
/* s0-s3: running partial sums.  They are folded into s0, which holds the
   result when the routine executes ret. */
#define s0 $f0
|
||||||
|
#define s1 $f1
|
||||||
|
#define s2 $f10
|
||||||
|
#define s3 $f11
|
||||||
|
|
||||||
|
/* a0-a7: destinations of the LD instructions (values freshly read from X). */
#define a0 $f12
|
||||||
|
#define a1 $f13
|
||||||
|
#define a2 $f14
|
||||||
|
#define a3 $f15
|
||||||
|
#define a4 $f16
|
||||||
|
#define a5 $f17
|
||||||
|
#define a6 $f18
|
||||||
|
#define a7 $f19
|
||||||
|
|
||||||
|
/* t0-t3: staging registers.  A loaded value is copied here with fmov and is
   added to a partial sum by a later ADD. */
#define t0 $f20
|
||||||
|
#define t1 $f21
|
||||||
|
#define t2 $f22
|
||||||
|
#define t3 $f23
|
||||||
|
|
||||||
|
/*
 * Sum kernel: leaves in s0 the sum of N values read through X, advancing X
 * with "SXADDQ INCX, X, X" after every load.  Loaded values are only copied
 * (fmov) before being added; no absolute value or other transform is applied.
 *
 * Structure: an 8-way unrolled, software-pipelined main loop ($L12), a drain
 * step for the last full group ($L13), a one-element remainder loop ($L17)
 * and the common exit ($L999).
 *
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE are macros defined
 * outside this file; presumably LD/ADD select the precision and SXADDQ scales
 * INCX by the element size before adding it to X -- confirm in the shared
 * headers.
 */
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
/* s0 and t0 are cleared before the N <= 0 test so that the early exit through
   $L999 (s0 = s0 + t0) returns 0. */
fclr s0
|
||||||
|
unop
|
||||||
|
fclr t0
|
||||||
|
ble N, $L999
|
||||||
|
|
||||||
|
/* I = N >> 3: number of full 8-element groups.  With none, go straight to the
   remainder handling at $L15 (s1 and s2 are already cleared for it). */
sra N, 3, I
|
||||||
|
fclr s1
|
||||||
|
fclr s2
|
||||||
|
ble I, $L15
|
||||||
|
|
||||||
|
/* Preload the first six values of the first group into a0-a5; the remaining
   clears (t1, t2, t3, s3) are interleaved with the loads. */
LD a0, 0 * SIZE(X)
|
||||||
|
fclr t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
fclr t2
|
||||||
|
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
fclr t3
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
fclr s3
|
||||||
|
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
/* The last group is finished in $L13, so the loop below runs I - 1 times. */
lda I, -1(I)
|
||||||
|
ble I, $L13
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Main loop.  Each ADD consumes the value staged in t0-t3 by an earlier fmov,
   each fmov stages the next loaded value, and the LDs fetch a6/a7 of the
   current group followed by a0-a5 of the next group. */
$L12:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
/* Load into $31 whose result is never read: presumably a prefetch hint
   (cf. PREFETCHSIZE) -- confirm. */
ldl $31, PREFETCHSIZE * 2 * SIZE(X)
|
||||||
|
fmov a0, t0
|
||||||
|
lda I, -1(I)
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
fmov a1, t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
fmov a2, t2
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s3, t3, s3
|
||||||
|
LD a0, 0 * SIZE(X)
|
||||||
|
fmov a3, t3
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
fmov a4, t0
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
fmov a5, t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
fmov a6, t2
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s3, t3, s3
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
fmov a7, t3
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
unop
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
bne I, $L12
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Drain for the last full group: load its final two values (a6, a7) and fold
   the staged values into s0-s3.  a4 is left staged in t0 and is added later,
   by the first ADD of $L17 or by $L999. */
$L13:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
fmov a0, t0
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
fmov a1, t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
fmov a2, t2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
fmov a3, t3
|
||||||
|
|
||||||
|
ADD s0, t0, s0
|
||||||
|
fmov a4, t0
|
||||||
|
ADD s1, t1, s1
|
||||||
|
fmov a5, t1
|
||||||
|
ADD s2, t2, s2
|
||||||
|
fmov a6, t2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
fmov a7, t3
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
ADD s2, t2, s2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
|
||||||
|
/* Fold the partial sums pairwise; $L15 then adds s2 into s0. */
ADD s0, s1, s0
|
||||||
|
ADD s2, s3, s2
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* I = N & 7: elements left over after the full groups. */
$L15:
|
||||||
|
and N, 7, I
|
||||||
|
ADD s0, s2, s0
|
||||||
|
unop
|
||||||
|
ble I, $L999
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Remainder loop, one element per iteration: add the previously staged value,
   then load and stage the current one in t0. */
$L17:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a0, 0 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
fmov a0, t0
|
||||||
|
|
||||||
|
lda I, -1(I)
|
||||||
|
bne I, $L17
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Common exit: add the last staged value (t0 is 0 when nothing was staged)
   and return with the result in s0. */
$L999:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
ret
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,208 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "version.h"
|
||||||
|
|
||||||
|
/* Distance used by the "ldl $31" in the main loop; it is multiplied by SIZE
   at the use site. */
#define PREFETCHSIZE 88
|
||||||
|
|
||||||
|
/* Integer registers: N is the element count, X the current element address,
   INCX the stride operand given to SXADDQ (doubled at routine entry), I the
   loop counter.
   NOTE(review): $16-$18 are presumably the incoming argument registers --
   confirm against the Alpha calling convention. */
#define N $16
|
||||||
|
#define X $17
|
||||||
|
#define INCX $18
|
||||||
|
#define I $19
|
||||||
|
|
||||||
|
/* s0-s3: running partial sums.  s0/s2 collect the values loaded from
   0 * SIZE(X) and s1/s3 those loaded from 1 * SIZE(X); everything is folded
   into s0 before ret. */
#define s0 $f0
|
||||||
|
#define s1 $f1
|
||||||
|
#define s2 $f10
|
||||||
|
#define s3 $f11
|
||||||
|
|
||||||
|
/* a0-a7: destinations of the LD instructions (values freshly read from X). */
#define a0 $f12
|
||||||
|
#define a1 $f13
|
||||||
|
#define a2 $f14
|
||||||
|
#define a3 $f15
|
||||||
|
#define a4 $f16
|
||||||
|
#define a5 $f17
|
||||||
|
#define a6 $f18
|
||||||
|
#define a7 $f19
|
||||||
|
|
||||||
|
/* t0-t3: staging registers.  A loaded value is copied here with fmov and is
   added to a partial sum by a later ADD. */
#define t0 $f20
|
||||||
|
#define t1 $f21
|
||||||
|
#define t2 $f22
|
||||||
|
#define t3 $f23
|
||||||
|
|
||||||
|
/*
 * Two-component sum kernel: every element is read as a pair of values, at
 * 0 * SIZE(X) and 1 * SIZE(X) (presumably the real and imaginary parts).
 * The routine leaves in s0 the sum of all first components plus the sum of
 * all second components over N elements.  Loaded values are only copied
 * (fmov) before being added; no absolute value is applied.
 *
 * Structure: a 4-element (8-value) unrolled, software-pipelined main loop
 * ($L12), a drain step for the last full group ($L13), a one-element
 * remainder loop ($L17) and the common exit ($L999).
 *
 * LD, ADD, SXADDQ, SIZE, PROLOGUE, PROFCODE and EPILOGUE are macros defined
 * outside this file; presumably LD/ADD select the precision and SXADDQ scales
 * INCX by the element size before adding it to X -- confirm in the shared
 * headers.
 */
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
/* s0, t0, s1 and t1 are cleared before the N <= 0 test so that the early exit
   through $L999 returns 0. */
fclr s0
|
||||||
|
unop
|
||||||
|
fclr t0
|
||||||
|
/* Double the stride: each element occupies two values. */
addq INCX, INCX, INCX
|
||||||
|
|
||||||
|
fclr s1
|
||||||
|
unop
|
||||||
|
fclr t1
|
||||||
|
ble N, $L999
|
||||||
|
|
||||||
|
/* I = N >> 2: number of full 4-element groups.  With none, go straight to the
   remainder handling at $L15 (s2 and s3 are already cleared for it). */
fclr s2
|
||||||
|
sra N, 2, I
|
||||||
|
fclr s3
|
||||||
|
ble I, $L15
|
||||||
|
|
||||||
|
/* Preload the first three elements of the first group into a0-a5; the
   remaining clears (t2, t3) are interleaved with the loads. */
LD a0, 0 * SIZE(X)
|
||||||
|
fclr t2
|
||||||
|
LD a1, 1 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
fclr t3
|
||||||
|
LD a3, 1 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
LD a5, 1 * SIZE(X)
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
/* The last group is finished in $L13, so the loop below runs I - 1 times. */
lda I, -1(I)
|
||||||
|
|
||||||
|
ble I, $L13
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Main loop.  Each ADD consumes the value staged in t0-t3 by an earlier fmov,
   each fmov stages the next loaded value, and the LDs fetch a6/a7 (fourth
   element of the current group) followed by a0-a5 of the next group. */
$L12:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
/* Load into $31 whose result is never read: presumably a prefetch hint
   (cf. PREFETCHSIZE) -- confirm. */
ldl $31, PREFETCHSIZE * SIZE(X)
|
||||||
|
fmov a0, t0
|
||||||
|
lda I, -1(I)
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
fmov a1, t1
|
||||||
|
unop
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
LD a7, 1 * SIZE(X)
|
||||||
|
fmov a2, t2
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s3, t3, s3
|
||||||
|
LD a0, 0 * SIZE(X)
|
||||||
|
fmov a3, t3
|
||||||
|
unop
|
||||||
|
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a1, 1 * SIZE(X)
|
||||||
|
fmov a4, t0
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
fmov a5, t1
|
||||||
|
unop
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
LD a3, 1 * SIZE(X)
|
||||||
|
fmov a6, t2
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s3, t3, s3
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
fmov a7, t3
|
||||||
|
unop
|
||||||
|
|
||||||
|
LD a5, 1 * SIZE(X)
|
||||||
|
unop
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
bne I, $L12
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Drain for the last full group: load its fourth element (a6, a7) and fold
   the staged values into s0-s3.  a4 and a5 are left staged in t0 and t1 and
   are added later, by $L17 or by $L999. */
$L13:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
fmov a0, t0
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a7, 1 * SIZE(X)
|
||||||
|
fmov a1, t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
fmov a2, t2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
fmov a3, t3
|
||||||
|
|
||||||
|
ADD s0, t0, s0
|
||||||
|
fmov a4, t0
|
||||||
|
ADD s1, t1, s1
|
||||||
|
fmov a5, t1
|
||||||
|
ADD s2, t2, s2
|
||||||
|
fmov a6, t2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
fmov a7, t3
|
||||||
|
|
||||||
|
ADD s2, t2, s2
|
||||||
|
ADD s3, t3, s3
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Fold s2 into s0 and s3 into s1, then compute I = N & 3: elements left over
   after the full groups. */
$L15:
|
||||||
|
ADD s0, s2, s0
|
||||||
|
and N, 3, I
|
||||||
|
ADD s1, s3, s1
|
||||||
|
ble I, $L999
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Remainder loop, one element per iteration: add the previously staged pair,
   then load and stage the current pair in t0/t1. */
$L17:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
LD a0, 0 * SIZE(X)
|
||||||
|
fmov a0, t0
|
||||||
|
lda I, -1(I)
|
||||||
|
|
||||||
|
ADD s1, t1, s1
|
||||||
|
LD a1, 1 * SIZE(X)
|
||||||
|
fmov a1, t1
|
||||||
|
SXADDQ INCX, X, X
|
||||||
|
|
||||||
|
bne I, $L17
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
/* Common exit: add the last staged pair (0 when nothing was staged), combine
   the two component sums and return with the result in s0. */
$L999:
|
||||||
|
ADD s0, t0, s0
|
||||||
|
ADD s1, t1, s1
|
||||||
|
|
||||||
|
ADD s0, s1, s0
|
||||||
|
ret
|
||||||
|
EPILOGUE
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = ../arm/zasum.c
|
ZASUMKERNEL = ../arm/zasum.c
|
||||||
|
|
||||||
|
SSUMKERNEL = ../arm/sum.c
|
||||||
|
DSUMKERNEL = ../arm/sum.c
|
||||||
|
CSUMKERNEL = ../arm/zsum.c
|
||||||
|
ZSUMKERNEL = ../arm/zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = ../arm/axpy.c
|
DAXPYKERNEL = ../arm/axpy.c
|
||||||
CAXPYKERNEL = ../arm/zaxpy.c
|
CAXPYKERNEL = ../arm/zaxpy.c
|
||||||
|
|
|
@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S
|
||||||
CASUMKERNEL = asum_vfp.S
|
CASUMKERNEL = asum_vfp.S
|
||||||
ZASUMKERNEL = asum_vfp.S
|
ZASUMKERNEL = asum_vfp.S
|
||||||
|
|
||||||
|
SSUMKERNEL = sum_vfp.S
|
||||||
|
DSUMKERNEL = sum_vfp.S
|
||||||
|
|
||||||
SAXPYKERNEL = axpy_vfp.S
|
SAXPYKERNEL = axpy_vfp.S
|
||||||
DAXPYKERNEL = axpy_vfp.S
|
DAXPYKERNEL = axpy_vfp.S
|
||||||
CAXPYKERNEL = axpy_vfp.S
|
CAXPYKERNEL = axpy_vfp.S
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* trivial copy of asum.c with the ABS() removed *
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/*
 * Generic C sum kernel: adds up n elements of x, taking every inc_x-th entry
 * starting at x[0], and returns the total.
 *
 * Returns 0.0 without reading x when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG limit;
	BLASLONG pos;

	if (n <= 0 || inc_x <= 0) {
		return total;
	}

	/* One past the last index visited: n strided steps of inc_x entries. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		total += x[pos];
	}

	return total;
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,425 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed *
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define STACKSIZE 256
|
||||||
|
|
||||||
|
#define N r0
|
||||||
|
#define X r1
|
||||||
|
#define INC_X r2
|
||||||
|
|
||||||
|
|
||||||
|
#define I r12
|
||||||
|
|
||||||
|
#define X_PRE 512
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
#if !defined(COMPLEX)
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
.macro KERNEL_F4
|
||||||
|
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
vldmia.f64 X!, { d4 - d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vldmia.f64 X!, { d6 - d7 }
|
||||||
|
vadd.f64 d1 , d1, d5
|
||||||
|
vadd.f64 d0 , d0, d6
|
||||||
|
vadd.f64 d1 , d1, d7
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S4
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
.macro KERNEL_F4
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 - s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vldmia.f32 X!, { s6 - s7 }
|
||||||
|
vadd.f32 s1 , s1, s5
|
||||||
|
vadd.f32 s0 , s0, s6
|
||||||
|
vadd.f32 s1 , s1, s7
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S4
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
|
||||||
|
.macro KERNEL_F4
|
||||||
|
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
vldmia.f64 X!, { d4 - d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vldmia.f64 X!, { d6 - d7 }
|
||||||
|
vadd.f64 d1 , d1, d5
|
||||||
|
vadd.f64 d0 , d0, d6
|
||||||
|
vadd.f64 d1 , d1, d7
|
||||||
|
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
vldmia.f64 X!, { d4 - d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vldmia.f64 X!, { d6 - d7 }
|
||||||
|
vadd.f64 d1 , d1, d5
|
||||||
|
vadd.f64 d0 , d0, d6
|
||||||
|
vadd.f64 d1 , d1, d7
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
|
||||||
|
vldmia.f64 X!, { d4 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S4
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 -d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vadd.f64 d0 , d0, d5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 -d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vadd.f64 d0 , d0, d5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 -d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vadd.f64 d0 , d0, d5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 -d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vadd.f64 d0 , d0, d5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f64 X, { d4 -d5 }
|
||||||
|
vadd.f64 d0 , d0, d4
|
||||||
|
vadd.f64 d0 , d0, d5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
.macro KERNEL_F4
|
||||||
|
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
vldmia.f32 X!, { s4 - s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vldmia.f32 X!, { s6 - s7 }
|
||||||
|
vadd.f32 s1 , s1, s5
|
||||||
|
vadd.f32 s0 , s0, s6
|
||||||
|
vadd.f32 s1 , s1, s7
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 - s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vldmia.f32 X!, { s6 - s7 }
|
||||||
|
vadd.f32 s1 , s1, s5
|
||||||
|
vadd.f32 s0 , s0, s6
|
||||||
|
vadd.f32 s1 , s1, s7
|
||||||
|
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro KERNEL_F1
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
|
||||||
|
vldmia.f32 X!, { s4 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S4
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 -s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vadd.f32 s0 , s0, s5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 -s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vadd.f32 s0 , s0, s5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 -s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vadd.f32 s0 , s0, s5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 -s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vadd.f32 s0 , s0, s5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL_S1
|
||||||
|
|
||||||
|
vldmia.f32 X, { s4 -s5 }
|
||||||
|
vadd.f32 s0 , s0, s4
|
||||||
|
vadd.f32 s0 , s0, s5
|
||||||
|
add X, X, INC_X
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
movs r12, #0 // clear floating point register
|
||||||
|
vmov s0, r12
|
||||||
|
vmov s1, r12
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
vcvt.f64.f32 d0, s0
|
||||||
|
vcvt.f64.f32 d1, s1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cmp N, #0
|
||||||
|
ble asum_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #0
|
||||||
|
beq asum_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #1
|
||||||
|
bne asum_kernel_S_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
asum_kernel_F_BEGIN:
|
||||||
|
|
||||||
|
asrs I, N, #2 // I = N / 4
|
||||||
|
ble asum_kernel_F1
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
asum_kernel_F4:
|
||||||
|
|
||||||
|
#if !defined(DOUBLE) && !defined(COMPLEX)
|
||||||
|
pld [ X, #X_PRE ]
|
||||||
|
#endif
|
||||||
|
KERNEL_F4
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
ble asum_kernel_F1
|
||||||
|
|
||||||
|
KERNEL_F4
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne asum_kernel_F4
|
||||||
|
|
||||||
|
asum_kernel_F1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble asum_kernel_L999
|
||||||
|
|
||||||
|
asum_kernel_F10:
|
||||||
|
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne asum_kernel_F10
|
||||||
|
|
||||||
|
b asum_kernel_L999
|
||||||
|
|
||||||
|
asum_kernel_S_BEGIN:
|
||||||
|
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
|
||||||
|
#else
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X * SIZE
|
||||||
|
#else
|
||||||
|
lsl INC_X, INC_X, #2 // INC_X * SIZE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
asrs I, N, #2 // I = N / 4
|
||||||
|
ble asum_kernel_S1
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
|
||||||
|
asum_kernel_S4:
|
||||||
|
|
||||||
|
KERNEL_S4
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne asum_kernel_S4
|
||||||
|
|
||||||
|
asum_kernel_S1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble asum_kernel_L999
|
||||||
|
|
||||||
|
asum_kernel_S10:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne asum_kernel_S10
|
||||||
|
|
||||||
|
|
||||||
|
asum_kernel_L999:
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
vadd.f64 d0 , d0, d1 // set return value
|
||||||
|
#else
|
||||||
|
vadd.f32 s0 , s0, s1 // set return value
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(__ARM_PCS_VFP)
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
vmov r0, s0
|
||||||
|
#else
|
||||||
|
vmov r0, r1, d0
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bx lr
|
||||||
|
|
||||||
|
EPILOGUE
|
||||||
|
|
|
@ -0,0 +1,57 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* trivial copy of zasum.c with the ABS() removed *
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#define CSUM1(x,i) x[i]+x[i+1]
|
||||||
|
|
||||||
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
{
|
||||||
|
BLASLONG i=0;
|
||||||
|
FLOAT sumf = 0.0;
|
||||||
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
|
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||||
|
|
||||||
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
|
n *= inc_x2;
|
||||||
|
while(i < n)
|
||||||
|
{
|
||||||
|
sumf += CSUM1(x,i);
|
||||||
|
i += inc_x2;
|
||||||
|
}
|
||||||
|
return(sumf);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,164 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N x0 /* vector length */
|
||||||
|
#define X x1 /* X vector address */
|
||||||
|
#define INC_X x2 /* X stride */
|
||||||
|
#define I x5 /* loop variable */
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define REG0 wzr
|
||||||
|
#define SUMF s0
|
||||||
|
#define TMPF s1
|
||||||
|
#define TMPVF {v1.s}[0]
|
||||||
|
#define SZ 4
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
/* Unit-stride tail step: load two consecutive floats at X (one element of
 * the interleaved vector), add them together and fold the result into SUMF. */
.macro KERNEL_F1
|
||||||
|
ld1 {v1.2s}, [X], #8 // v1 lanes 0,1 = the two floats at X; X += 8 bytes
|
||||||
|
ext v2.8b, v1.8b, v1.8b, #4 // rotate v1 by 4 bytes so its second float lands in lane 0 of v2 (s2)
|
||||||
|
fadd TMPF, TMPF, s2 // TMPF is s1 (lane 0 of v1): first float + second float
|
||||||
|
fadd SUMF, SUMF, TMPF // add the pair sum to the running total
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Unit-stride bulk step: consume 16 consecutive floats (64 bytes) and add
 * them lane-wise into the four partial sums kept in v0.4s.  The partial
 * sums are collapsed to a scalar later by KERNEL_F8_FINALIZE. */
.macro KERNEL_F8
|
||||||
|
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] // load 4 vectors of 4 floats; no post-increment on this form
|
||||||
|
add X, X, #64 // advance X past the 64 bytes just loaded
|
||||||
|
|
||||||
|
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes beyond the updated X
|
||||||
|
|
||||||
|
fadd v1.4s, v1.4s, v2.4s // combine the loaded vectors pairwise ...
|
||||||
|
fadd v3.4s, v3.4s, v4.4s
|
||||||
|
fadd v0.4s, v0.4s, v1.4s // ... then add both pair sums into the accumulator
|
||||||
|
fadd v0.4s, v0.4s, v3.4s
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Horizontal reduction: collapse the four float lanes of v0 into the
 * scalar SUMF (s0). */
.macro KERNEL_F8_FINALIZE
|
||||||
|
ext v1.16b, v0.16b, v0.16b, #8 // v1 low half = v0 high half (lanes 2,3)
|
||||||
|
fadd v0.2s, v0.2s, v1.2s // lanes 0,1 = lane0+lane2, lane1+lane3
|
||||||
|
faddp SUMF, v0.2s // SUMF = lane 0 + lane 1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided-path setup: convert INC_X from an element count to a byte offset. */
.macro INIT_S
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X *= 8, i.e. two 4-byte floats per element
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided step: same arithmetic as KERNEL_F1, but X advances by INC_X
 * (already scaled to bytes by INIT_S) instead of a fixed 8 bytes. */
.macro KERNEL_S1
|
||||||
|
ld1 {v1.2s}, [X], INC_X // v1 lanes 0,1 = the two floats at X; X += INC_X
|
||||||
|
ext v2.8b, v1.8b, v1.8b, #4 // bring the second float into lane 0 of v2 (s2)
|
||||||
|
fadd TMPF, TMPF, s2 // TMPF (s1) = first float + second float
|
||||||
|
fadd SUMF, SUMF, TMPF // add the pair sum to the running total
|
||||||
|
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
fmov SUMF, REG0
|
||||||
|
fmov s1, SUMF
|
||||||
|
|
||||||
|
cmp N, xzr
|
||||||
|
ble .Lcsum_kernel_L999
|
||||||
|
cmp INC_X, xzr
|
||||||
|
ble .Lcsum_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #1
|
||||||
|
bne .Lcsum_kernel_S_BEGIN
|
||||||
|
|
||||||
|
.Lcsum_kernel_F_BEGIN:
|
||||||
|
|
||||||
|
asr I, N, #3
|
||||||
|
cmp I, xzr
|
||||||
|
beq .Lcsum_kernel_F1
|
||||||
|
|
||||||
|
.Lcsum_kernel_F8:
|
||||||
|
|
||||||
|
KERNEL_F8
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lcsum_kernel_F8
|
||||||
|
|
||||||
|
KERNEL_F8_FINALIZE
|
||||||
|
|
||||||
|
.Lcsum_kernel_F1:
|
||||||
|
|
||||||
|
ands I, N, #7
|
||||||
|
ble .Lcsum_kernel_L999
|
||||||
|
|
||||||
|
.Lcsum_kernel_F10:
|
||||||
|
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lcsum_kernel_F10
|
||||||
|
|
||||||
|
.Lcsum_kernel_L999:
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lcsum_kernel_S_BEGIN:
|
||||||
|
|
||||||
|
INIT_S
|
||||||
|
|
||||||
|
asr I, N, #2
|
||||||
|
cmp I, xzr
|
||||||
|
ble .Lcsum_kernel_S1
|
||||||
|
|
||||||
|
.Lcsum_kernel_S4:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lcsum_kernel_S4
|
||||||
|
|
||||||
|
.Lcsum_kernel_S1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble .Lcsum_kernel_L999
|
||||||
|
|
||||||
|
.Lcsum_kernel_S10:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lcsum_kernel_S10
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,186 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N x0 /* vector length */
|
||||||
|
#define X x1 /* X vector address */
|
||||||
|
#define INC_X x2 /* X stride */
|
||||||
|
#define I x5 /* loop variable */
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
#define REG0 wzr
|
||||||
|
#define SUMF s0
|
||||||
|
#define TMPF s1
|
||||||
|
#define TMPVF {v1.s}[0]
|
||||||
|
#define SZ 4
|
||||||
|
#else
|
||||||
|
#define REG0 xzr
|
||||||
|
#define SUMF d0
|
||||||
|
#define TMPF d1
|
||||||
|
#define TMPVF {v1.d}[0]
|
||||||
|
#define SZ 8
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
/* Unit-stride tail step: add one scalar element at X to SUMF. */
.macro KERNEL_F1
|
||||||
|
ldr TMPF, [X], #SZ // TMPF = *X; X += SZ (4 bytes single, 8 bytes double)
|
||||||
|
fadd SUMF, SUMF, TMPF // accumulate into the running total
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Unit-stride bulk step: consume eight consecutive elements X0..X7 and add
 * them lane-wise into the vector accumulator v0 (4 float lanes, or 2 double
 * lanes when DOUBLE is defined).  KERNEL_F8_FINALIZE reduces v0 to SUMF. */
.macro KERNEL_F8
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
ld1 {v1.4s, v2.4s}, [X], #32 // v1 = [X3, X2, X1, X0], v2 = [X7, X6, X5, X4]; X += 32 bytes
|
||||||
|
fadd v1.4s, v1.4s, v2.4s // v1 = [X3+X7, X2+X6, X1+X5, X0+X4]
|
||||||
|
fadd v0.4s, v0.4s, v1.4s // add those four lane sums into the running partial sums in v0
|
||||||
|
PRFM PLDL1KEEP, [X, #1024]
|
||||||
|
#else // DOUBLE
|
||||||
|
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] // load 4 vectors of 2 doubles (X0..X7); no post-increment here
|
||||||
|
add X, X, #64 // advance X past the 64 bytes just loaded
|
||||||
|
|
||||||
|
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes beyond the updated X
|
||||||
|
|
||||||
|
fadd v2.2d, v2.2d, v3.2d // combine the loaded vectors pairwise ...
|
||||||
|
fadd v4.2d, v4.2d, v5.2d
|
||||||
|
fadd v0.2d, v0.2d, v2.2d // ... then add both pair sums into the accumulator
|
||||||
|
fadd v0.2d, v0.2d, v4.2d
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Horizontal reduction: collapse the lanes of the vector accumulator v0
 * into the scalar SUMF. */
.macro KERNEL_F8_FINALIZE
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
ext v1.16b, v0.16b, v0.16b, #8 // v1 low half = v0 high half (lanes 2,3)
|
||||||
|
fadd v0.2s, v0.2s, v1.2s // lanes 0,1 = lane0+lane2, lane1+lane3
|
||||||
|
faddp SUMF, v0.2s // SUMF = lane 0 + lane 1
|
||||||
|
#else
|
||||||
|
faddp SUMF, v0.2d // SUMF = sum of the two double lanes
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided-path setup: convert INC_X from an element count to a byte offset. */
.macro INIT_S
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
lsl INC_X, INC_X, #2 // INC_X *= 4 (bytes per single-precision element)
|
||||||
|
#else
|
||||||
|
lsl INC_X, INC_X, #3 // INC_X *= 8 (bytes per double-precision element)
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided step: add one element at X to SUMF, then advance X by INC_X
 * (already scaled to bytes by INIT_S). */
.macro KERNEL_S1
|
||||||
|
ld1 TMPVF, [X], INC_X // load one element into lane 0 of v1 (aliased by TMPF); X += INC_X
|
||||||
|
fadd SUMF, SUMF, TMPF // accumulate into the running total
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
fmov SUMF, REG0
|
||||||
|
#if !defined(DOUBLE)
|
||||||
|
fmov s1, SUMF
|
||||||
|
#else
|
||||||
|
fmov d1, SUMF
|
||||||
|
#endif
|
||||||
|
|
||||||
|
cmp N, xzr
|
||||||
|
ble .Lsum_kernel_L999
|
||||||
|
cmp INC_X, xzr
|
||||||
|
ble .Lsum_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #1
|
||||||
|
bne .Lsum_kernel_S_BEGIN
|
||||||
|
|
||||||
|
.Lsum_kernel_F_BEGIN:
|
||||||
|
|
||||||
|
asr I, N, #3
|
||||||
|
cmp I, xzr
|
||||||
|
beq .Lsum_kernel_F1
|
||||||
|
|
||||||
|
.Lsum_kernel_F8:
|
||||||
|
|
||||||
|
KERNEL_F8
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsum_kernel_F8
|
||||||
|
|
||||||
|
KERNEL_F8_FINALIZE
|
||||||
|
|
||||||
|
.Lsum_kernel_F1:
|
||||||
|
|
||||||
|
ands I, N, #7
|
||||||
|
ble .Lsum_kernel_L999
|
||||||
|
|
||||||
|
.Lsum_kernel_F10:
|
||||||
|
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsum_kernel_F10
|
||||||
|
|
||||||
|
.Lsum_kernel_L999:
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lsum_kernel_S_BEGIN:
|
||||||
|
|
||||||
|
INIT_S
|
||||||
|
|
||||||
|
asr I, N, #2
|
||||||
|
cmp I, xzr
|
||||||
|
ble .Lsum_kernel_S1
|
||||||
|
|
||||||
|
.Lsum_kernel_S4:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsum_kernel_S4
|
||||||
|
|
||||||
|
.Lsum_kernel_S1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble .Lsum_kernel_L999
|
||||||
|
|
||||||
|
.Lsum_kernel_S10:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lsum_kernel_S10
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,158 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2015, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N x0 /* vector length */
|
||||||
|
#define X x1 /* X vector address */
|
||||||
|
#define INC_X x2 /* X stride */
|
||||||
|
#define I x5 /* loop variable */
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* Macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#define REG0 xzr
|
||||||
|
#define SUMF d0
|
||||||
|
#define TMPF d1
|
||||||
|
#define TMPVF {v1.d}[0]
|
||||||
|
#define SZ 8
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
/* Unit-stride tail step: load two consecutive doubles at X (one element of
 * the interleaved vector), add them together and fold the result into SUMF. */
.macro KERNEL_F1
|
||||||
|
ld1 {v1.2d}, [X], #16 // v1 = the two doubles at X; X += 16 bytes
|
||||||
|
faddp TMPF, v1.2d // TMPF = lane 0 + lane 1
|
||||||
|
fadd SUMF, SUMF, TMPF // add the pair sum to the running total
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Unit-stride bulk step: consume 8 consecutive doubles (64 bytes) and add
 * them lane-wise into v0.2d, so lane 0 collects the even-indexed doubles
 * and lane 1 the odd-indexed ones.  KERNEL_F4_FINALIZE reduces v0 to SUMF. */
.macro KERNEL_F4
|
||||||
|
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 // load 4 vectors of 2 doubles; X += 64 bytes
|
||||||
|
|
||||||
|
fadd v1.2d, v1.2d, v2.2d // combine the loaded vectors pairwise
|
||||||
|
fadd v3.2d, v3.2d, v4.2d
|
||||||
|
|
||||||
|
fadd v0.2d, v0.2d, v1.2d // add both pair sums into the accumulator
|
||||||
|
fadd v0.2d, v0.2d, v3.2d
|
||||||
|
|
||||||
|
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes beyond the updated X
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Horizontal reduction: collapse the two double lanes of v0 into SUMF. */
.macro KERNEL_F4_FINALIZE
|
||||||
|
faddp SUMF, v0.2d // SUMF = lane 0 + lane 1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided-path setup: convert INC_X from an element count to a byte offset. */
.macro INIT_S
|
||||||
|
lsl INC_X, INC_X, #4 // INC_X *= 16, i.e. two 8-byte doubles per element
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/* Strided step: same arithmetic as KERNEL_F1, but X advances by INC_X
 * (already scaled to bytes by INIT_S) instead of a fixed 16 bytes. */
.macro KERNEL_S1
|
||||||
|
ld1 {v1.2d}, [X], INC_X // v1 = the two doubles at X; X += INC_X
|
||||||
|
faddp TMPF, v1.2d // TMPF = lane 0 + lane 1
|
||||||
|
fadd SUMF, SUMF, TMPF // add the pair sum to the running total
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*******************************************************************************
|
||||||
|
* End of macro definitions
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
fmov SUMF, REG0
|
||||||
|
|
||||||
|
cmp N, xzr
|
||||||
|
ble .Lzsum_kernel_L999
|
||||||
|
cmp INC_X, xzr
|
||||||
|
ble .Lzsum_kernel_L999
|
||||||
|
|
||||||
|
cmp INC_X, #1
|
||||||
|
bne .Lzsum_kernel_S_BEGIN
|
||||||
|
|
||||||
|
.Lzsum_kernel_F_BEGIN:
|
||||||
|
|
||||||
|
asr I, N, #2
|
||||||
|
cmp I, xzr
|
||||||
|
beq .Lzsum_kernel_F1
|
||||||
|
|
||||||
|
.Lzsum_kernel_F4:
|
||||||
|
|
||||||
|
KERNEL_F4
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lzsum_kernel_F4
|
||||||
|
|
||||||
|
KERNEL_F4_FINALIZE
|
||||||
|
|
||||||
|
.Lzsum_kernel_F1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble .Lzsum_kernel_L999
|
||||||
|
|
||||||
|
.Lzsum_kernel_F10:
|
||||||
|
|
||||||
|
KERNEL_F1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lzsum_kernel_F10
|
||||||
|
|
||||||
|
.Lzsum_kernel_L999:
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lzsum_kernel_S_BEGIN:
|
||||||
|
|
||||||
|
INIT_S
|
||||||
|
|
||||||
|
asr I, N, #2
|
||||||
|
cmp I, xzr
|
||||||
|
ble .Lzsum_kernel_S1
|
||||||
|
|
||||||
|
.Lzsum_kernel_S4:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lzsum_kernel_S4
|
||||||
|
|
||||||
|
.Lzsum_kernel_S1:
|
||||||
|
|
||||||
|
ands I, N, #3
|
||||||
|
ble .Lzsum_kernel_L999
|
||||||
|
|
||||||
|
.Lzsum_kernel_S10:
|
||||||
|
|
||||||
|
KERNEL_S1
|
||||||
|
|
||||||
|
subs I, I, #1
|
||||||
|
bne .Lzsum_kernel_S10
|
||||||
|
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -60,6 +60,10 @@ CASUMKERNEL = asum.S
|
||||||
ZASUMKERNEL = asum.S
|
ZASUMKERNEL = asum.S
|
||||||
XASUMKERNEL = asum.S
|
XASUMKERNEL = asum.S
|
||||||
|
|
||||||
|
CSUMKERNEL = sum.S
|
||||||
|
ZSUMKERNEL = sum.S
|
||||||
|
XSUMKERNEL = sum.S
|
||||||
|
|
||||||
CNRM2KERNEL = nrm2.S
|
CNRM2KERNEL = nrm2.S
|
||||||
ZNRM2KERNEL = nrm2.S
|
ZNRM2KERNEL = nrm2.S
|
||||||
XNRM2KERNEL = nrm2.S
|
XNRM2KERNEL = nrm2.S
|
||||||
|
|
|
@ -0,0 +1,358 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2019, The OpenBLAS project */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Prefetch distance in elements, selected per precision. The kernel multiplies it by SIZE when it forms the prefetch pointer PRE1. */
#ifdef XDOUBLE
|
||||||
|
#define PREFETCH_SIZE ( 8 * 16 + 4)
|
||||||
|
#elif defined(DOUBLE)
|
||||||
|
#define PREFETCH_SIZE (16 * 16 + 8)
|
||||||
|
#else
|
||||||
|
#define PREFETCH_SIZE (32 * 16 + 16)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* COMPADD is 1 in COMPLEX builds and 0 otherwise; it adjusts the shift counts and tested bit positions in the kernel. */
/* STRIDE is the post-increment used by every other load: INCX for real data, SIZE for complex data. */
#ifndef COMPLEX
|
||||||
|
#define COMPADD 0
|
||||||
|
#define STRIDE INCX
|
||||||
|
#else
|
||||||
|
#define COMPADD 1
|
||||||
|
#define STRIDE SIZE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Prefetch address, advanced by INCX16 inside the main loop. */
#define PRE1 r2
|
||||||
|
|
||||||
|
/* I: main-loop trip count (minus one once loaded into ar.lc); J: leftover count (low bits of N). */
#define I r17
|
||||||
|
#define J r18
|
||||||
|
/* Prefetch post-increment, derived from the scaled INCX. */
#define INCX16 r21
|
||||||
|
|
||||||
|
/* Saved copies of the predicate registers and of ar.lc, restored before the normal return. */
#define PR r30
|
||||||
|
#define ARLC r31
|
||||||
|
|
||||||
|
/* Incoming arguments: element count, data pointer, element stride. */
#define N r32
|
||||||
|
#define X r33
|
||||||
|
#define INCX r34
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Sum kernel: adds the values reached through X without taking absolute
 * values. Four partial sums are kept in f8-f11 and folded into f8 just
 * before the final return. Returns early, with f8 still holding the
 * initial f0 value, when INCX <= 0 or N <= 0.
 */
PROLOGUE
|
||||||
|
.prologue
|
||||||
|
PROFCODE
|
||||||
|
{ .mfi
|
||||||
|
/* PRE1 starts PREFETCH_SIZE elements ahead of X; f8 is initialised from f0; ar.lc is saved in ARLC. */
adds PRE1 = PREFETCH_SIZE * SIZE, X
|
||||||
|
mov f8 = f0
|
||||||
|
.save ar.lc, ARLC
|
||||||
|
mov ARLC = ar.lc
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
.body
|
||||||
|
/* With F_INTERFACE, N and INCX arrive as addresses and are loaded here; unless USE64BITINT is set they are then sign-extended from 32 bits. */
#ifdef F_INTERFACE
|
||||||
|
{ .mmi
|
||||||
|
LDINT N = [N]
|
||||||
|
LDINT INCX = [INCX]
|
||||||
|
nop.i 0
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
#ifndef USE64BITINT
|
||||||
|
{ .mii
|
||||||
|
nop.m 0
|
||||||
|
sxt4 N = N
|
||||||
|
sxt4 INCX = INCX
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
{ .mmi
|
||||||
|
/* p6 is set when INCX <= 0 and p7 when N <= 0; either one triggers the early return below. */
cmp.lt p0, p6 = r0, INCX
|
||||||
|
cmp.lt p0, p7 = r0, N
|
||||||
|
/* I = N >> (4 - COMPADD): number of full main-loop iterations (16 loads each). */
shr I = N, (4 - COMPADD)
|
||||||
|
}
|
||||||
|
{ .mbb
|
||||||
|
/* J = the low (4 - COMPADD) bits of N: elements left over for the tail code at .L55. */
and J = ((1 << (4 - COMPADD)) - 1), N
|
||||||
|
(p6) br.ret.sptk.many b0
|
||||||
|
(p7) br.ret.sptk.many b0
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mfi
|
||||||
|
/* I becomes the trip count minus one (the value later written to ar.lc); predicates are saved in PR. */
adds I = -1, I
|
||||||
|
mov f10 = f0
|
||||||
|
mov PR = pr
|
||||||
|
}
|
||||||
|
{ .mfi
|
||||||
|
/* p9: no leftover elements. p12: bit (3 - COMPADD) of N is set, i.e. the tail has an 8-load group. */
cmp.eq p9, p0 = r0, J
|
||||||
|
mov f9 = f0
|
||||||
|
tbit.z p0, p12 = N, 3 - COMPADD
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmi
|
||||||
|
/* Initial stage predicates for the pipelined loop: p16 true, p17-p19 false; epilogue count is 3. */
cmp.eq p16, p0 = r0, r0
|
||||||
|
cmp.ne p17, p0 = r0, r0
|
||||||
|
mov ar.ec= 3
|
||||||
|
}
|
||||||
|
{ .mfi
|
||||||
|
cmp.ne p18, p0 = r0, r0
|
||||||
|
mov f11 = f0
|
||||||
|
/* Scale INCX by 2^(BASE_SHIFT + COMPADD) - presumably element stride to byte stride; BASE_SHIFT is defined outside this file. */
shl INCX = INCX, BASE_SHIFT + COMPADD
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmi
|
||||||
|
/* INCX16 is the prefetch step. XDOUBLE uses half the step because it prefetches twice per loop iteration. */
#ifdef XDOUBLE
|
||||||
|
shladd INCX16 = INCX, (3 - COMPADD), r0
|
||||||
|
#else
|
||||||
|
shladd INCX16 = INCX, (4 - COMPADD), r0
|
||||||
|
#endif
|
||||||
|
cmp.ne p19, p0 = r0, r0
|
||||||
|
mov ar.lc = I
|
||||||
|
}
|
||||||
|
{ .mmb
|
||||||
|
/* p8: I < 0, i.e. no full iteration to run; skip the main loop. */
cmp.gt p8 ,p0 = r0, I
|
||||||
|
/* COMPLEX: the first load of each pair already advances X by SIZE, so the second advances by INCX - SIZE. */
#ifdef COMPLEX
|
||||||
|
adds INCX = - SIZE, INCX
|
||||||
|
#else
|
||||||
|
nop.m 0
|
||||||
|
#endif
|
||||||
|
(p8) br.cond.dpnt .L55
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
.align 32
|
||||||
|
|
||||||
|
/*
 * Main loop: 16 loads per iteration under p16, alternating STRIDE and INCX
 * post-increments. The FADDs under p18/p19 consume values loaded by earlier
 * iterations; the register numbers are offset to account for the register
 * rotation done by br.ctop.
 */
.L52:
|
||||||
|
{ .mmf
|
||||||
|
(p16) lfetch.nt1 [PRE1], INCX16
|
||||||
|
(p16) LDFD f32 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p19) FADD f8 = f8, f71
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f35 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p19) FADD f9 = f9, f74
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f38 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p19) FADD f10 = f10, f77
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f41 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p19) FADD f11 = f11, f80
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f44 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f8 = f8, f34
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f47 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f9 = f9, f37
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f50 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f10 = f10, f40
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f53 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f11 = f11, f43
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
/* Second prefetch per iteration, XDOUBLE only. */
#ifdef XDOUBLE
|
||||||
|
(p16) lfetch.nt1 [PRE1], INCX16
|
||||||
|
#endif
|
||||||
|
(p16) LDFD f56 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f8 = f8, f46
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f59 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f9 = f9, f49
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f62 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f10 = f10, f52
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f65 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f11 = f11, f55
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f68 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f8 = f8, f58
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f71 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f9 = f9, f61
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f74 = [X], STRIDE
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f10 = f10, f64
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mmf
|
||||||
|
(p16) LDFD f77 = [X], INCX
|
||||||
|
}
|
||||||
|
{ .mfb
|
||||||
|
(p18) FADD f11 = f11, f67
|
||||||
|
}
|
||||||
|
br.ctop.sptk.few .L52
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
/* Unpredicated adds of the same four registers the p19 stage reads; presumably they drain the values still in flight after the loop exits. */
FADD f8 = f8, f71
|
||||||
|
FADD f9 = f9, f74
|
||||||
|
FADD f10 = f10, f77
|
||||||
|
FADD f11 = f11, f80
|
||||||
|
.align 32
|
||||||
|
;;
|
||||||
|
/*
 * Tail: handles the J leftover elements. p12 guards a group of 8 loads,
 * p13 a group of 4, p14 a group of 2 and (real builds only) p15 a single
 * load. Branches straight to .L998 when J == 0 (p9).
 */
.L55:
|
||||||
|
(p12) LDFD f32 = [X], STRIDE
|
||||||
|
(p9) br.cond.dptk .L998
|
||||||
|
;;
|
||||||
|
(p12) LDFD f33 = [X], INCX
|
||||||
|
;;
|
||||||
|
(p12) LDFD f34 = [X], STRIDE
|
||||||
|
;;
|
||||||
|
(p12) LDFD f35 = [X], INCX
|
||||||
|
tbit.z p0, p13 = N, (2 - COMPADD)
|
||||||
|
;;
|
||||||
|
(p12) LDFD f36 = [X], STRIDE
|
||||||
|
tbit.z p0, p14 = N, (1 - COMPADD)
|
||||||
|
;;
|
||||||
|
(p12) LDFD f37 = [X], INCX
|
||||||
|
#ifndef COMPLEX
|
||||||
|
tbit.z p0, p15 = N, 0
|
||||||
|
#endif
|
||||||
|
;;
|
||||||
|
(p12) LDFD f38 = [X], STRIDE
|
||||||
|
;;
|
||||||
|
(p12) LDFD f39 = [X], INCX
|
||||||
|
;;
|
||||||
|
(p13) LDFD f40 = [X], STRIDE
|
||||||
|
;;
|
||||||
|
(p13) LDFD f41 = [X], INCX
|
||||||
|
;;
|
||||||
|
(p13) LDFD f42 = [X], STRIDE
|
||||||
|
(p12) FADD f8 = f8, f32
|
||||||
|
;;
|
||||||
|
(p13) LDFD f43 = [X], INCX
|
||||||
|
(p12) FADD f9 = f9, f33
|
||||||
|
;;
|
||||||
|
(p14) LDFD f44 = [X], STRIDE
|
||||||
|
(p12) FADD f10 = f10, f34
|
||||||
|
;;
|
||||||
|
(p14) LDFD f45 = [X], INCX
|
||||||
|
(p12) FADD f11 = f11, f35
|
||||||
|
;;
|
||||||
|
#ifndef COMPLEX
|
||||||
|
(p15) LDFD f46 = [X]
|
||||||
|
#endif
|
||||||
|
(p12) FADD f8 = f8, f36
|
||||||
|
;;
|
||||||
|
(p12) FADD f9 = f9, f37
|
||||||
|
(p12) FADD f10 = f10, f38
|
||||||
|
(p12) FADD f11 = f11, f39
|
||||||
|
;;
|
||||||
|
(p13) FADD f8 = f8, f40
|
||||||
|
(p13) FADD f9 = f9, f41
|
||||||
|
/* NOTE(review): empty conditional block; it contains no instructions. */
#ifndef COMPLEX
|
||||||
|
#endif
|
||||||
|
(p13) FADD f10 = f10, f42
|
||||||
|
;;
|
||||||
|
(p13) FADD f11 = f11, f43
|
||||||
|
(p14) FADD f8 = f8, f44
|
||||||
|
(p14) FADD f9 = f9, f45
|
||||||
|
#ifndef COMPLEX
|
||||||
|
(p15) FADD f10 = f10, f46
|
||||||
|
#endif
|
||||||
|
;;
|
||||||
|
.align 32
|
||||||
|
|
||||||
|
/* Exit: fold f9 into f8 and f11 into f10, restore ar.lc and the saved predicates (masked), then add f10 into f8 and return. */
.L998:
|
||||||
|
{ .mfi
|
||||||
|
FADD f8 = f8, f9
|
||||||
|
mov ar.lc = ARLC
|
||||||
|
}
|
||||||
|
{ .mmf
|
||||||
|
FADD f10 = f10, f11
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mii
|
||||||
|
mov pr = PR, -65474
|
||||||
|
}
|
||||||
|
;;
|
||||||
|
{ .mfb
|
||||||
|
FADD f8 = f8, f10
|
||||||
|
br.ret.sptk.many b0
|
||||||
|
}
|
||||||
|
EPILOGUE
|
|
@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c
|
||||||
ISMINKERNEL = ../mips/imin.c
|
ISMINKERNEL = ../mips/imin.c
|
||||||
IDMINKERNEL = ../mips/imin.c
|
IDMINKERNEL = ../mips/imin.c
|
||||||
|
|
||||||
|
SSUMKERNEL = ../mips/sum.c
|
||||||
|
DSUMKERNEL = ../mips/sum.c
|
||||||
|
CSUMKERNEL = ../mips/zsum.c
|
||||||
|
ZSUMKERNEL = ../mips/zsum.c
|
||||||
|
|
||||||
ifdef HAVE_MSA
|
ifdef HAVE_MSA
|
||||||
SASUMKERNEL = ../mips/sasum_msa.c
|
SASUMKERNEL = ../mips/sasum_msa.c
|
||||||
DASUMKERNEL = ../mips/dasum_msa.c
|
DASUMKERNEL = ../mips/dasum_msa.c
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Plain sum (no absolute values) of n entries of x, visiting every
 * inc_x-th entry starting at x[0].
 *
 * Returns 0.0 when n or inc_x is not strictly positive.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (!(n > 0 && inc_x > 0)) {
		return total;
	}

	/* One past the last index to visit; indices advance in steps of inc_x. */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x) {
		total += x[pos];
	}

	return total;
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2016, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#define CSUM1(x,i) x[i]+x[i+1]
|
||||||
|
|
||||||
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
{
|
||||||
|
BLASLONG i=0;
|
||||||
|
FLOAT sumf = 0.0;
|
||||||
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
|
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||||
|
|
||||||
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
|
n *= inc_x2;
|
||||||
|
while(i < n)
|
||||||
|
{
|
||||||
|
sumf += CSUM1(x,i);
|
||||||
|
i += inc_x2;
|
||||||
|
}
|
||||||
|
return(sumf);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,332 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Incoming arguments: element count, data pointer, element stride. */
#define N $4
|
||||||
|
#define X $5
|
||||||
|
#define INCX $6
|
||||||
|
|
||||||
|
/* Scratch integer registers: loop counter and a temporary that holds SIZE for the unit-stride test. */
#define I $2
|
||||||
|
#define TEMP $3
|
||||||
|
|
||||||
|
/* a1-a8: destinations of the loads from X. */
#define a1 $f2
|
||||||
|
#define a2 $f3
|
||||||
|
#define a3 $f4
|
||||||
|
#define a4 $f5
|
||||||
|
#define a5 $f6
|
||||||
|
#define a6 $f7
|
||||||
|
#define a7 $f8
|
||||||
|
#define a8 $f9
|
||||||
|
|
||||||
|
/* t1-t4: copies of loaded values staged for the next round of adds. */
#define t1 $f10
|
||||||
|
#define t2 $f11
|
||||||
|
#define t3 $f12
|
||||||
|
#define t4 $f13
|
||||||
|
|
||||||
|
/* s1, s2: the two running partial sums; they are added together at .L999. */
#define s1 $f0
|
||||||
|
#define s2 $f1
|
||||||
|
|
||||||
|
/*
 * Sum kernel: adds N values read from X with stride INCX, without taking
 * absolute values. Two partial sums (s1, s2) are accumulated and combined
 * into s1 on return. The instruction written after each branch or jump
 * appears to be relied on as its delay slot (see the final ADD after
 * "j $31").
 * NOTE(review): only N <= 0 is checked; there is no INCX <= 0 test here.
 */
PROLOGUE
|
||||||
|
|
||||||
|
/* With F_INTERFACE, N and INCX arrive as addresses and are loaded here. */
#ifdef F_INTERFACE
|
||||||
|
LDINT N, 0(N)
|
||||||
|
LDINT INCX, 0(INCX)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Both partial sums start from the zero register. */
MTC $0, s1
|
||||||
|
|
||||||
|
MTC $0, s2
|
||||||
|
/* Scale INCX by 2^BASE_SHIFT - presumably element stride to byte stride, since it is compared against SIZE below. */
dsll INCX, INCX, BASE_SHIFT
|
||||||
|
|
||||||
|
blez N, .L999
|
||||||
|
li TEMP, SIZE
|
||||||
|
|
||||||
|
/* A non-unit stride goes to .L20; I = N / 8 is computed on both paths. */
bne INCX, TEMP, .L20
|
||||||
|
dsra I, N, 3
|
||||||
|
|
||||||
|
blez I, .L15
|
||||||
|
NOP
|
||||||
|
|
||||||
|
/* Unit stride: preload the first group of eight values and stage the first four in t1-t4. */
LD a1, 0 * SIZE(X)
|
||||||
|
LD a2, 1 * SIZE(X)
|
||||||
|
LD a3, 2 * SIZE(X)
|
||||||
|
LD a4, 3 * SIZE(X)
|
||||||
|
|
||||||
|
LD a5, 4 * SIZE(X)
|
||||||
|
MOV t1, a1
|
||||||
|
LD a6, 5 * SIZE(X)
|
||||||
|
MOV t2, a2
|
||||||
|
LD a7, 6 * SIZE(X)
|
||||||
|
MOV t3, a3
|
||||||
|
|
||||||
|
MOV t4, a4
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
blez I, .L13
|
||||||
|
LD a8, 7 * SIZE(X)
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Unit-stride main loop: adds the eight values of the current group (alternating s1/s2) while loading the next group, then advances X by eight elements. */
.L12:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a1, 8 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a5
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a2, 9 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a6
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a3, 10 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a7
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
LD a4, 11 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t4, a8
|
||||||
|
daddiu X, X, 8 * SIZE
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a5, 4 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a6, 5 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a2
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a7, 6 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a3
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
LD a8, 7 * SIZE(X)
|
||||||
|
|
||||||
|
bgtz I, .L12
|
||||||
|
MOV t4, a4
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Drain: add the last preloaded group of eight values; no further loads. */
.L13:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
daddiu X, X, 8 * SIZE
|
||||||
|
|
||||||
|
MOV t1, a5
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
MOV t2, a6
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
MOV t3, a7
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
MOV t4, a8
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
ADD s2, s2, t2
|
||||||
|
ADD s1, s1, t3
|
||||||
|
ADD s2, s2, t4
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Unit-stride remainder: the N & 7 leftover values are added one at a time into s1. */
.L15:
|
||||||
|
andi I, N, 7
|
||||||
|
|
||||||
|
blez I, .L999
|
||||||
|
NOP
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
.L16:
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
|
||||||
|
bgtz I, .L16
|
||||||
|
daddiu X, X, SIZE
|
||||||
|
|
||||||
|
j .L999
|
||||||
|
NOP
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* General-stride path: same structure as above, but X is advanced by INCX after every load. I already holds N / 8. */
.L20:
|
||||||
|
blez I, .L25
|
||||||
|
NOP
|
||||||
|
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a2
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
MOV t3, a3
|
||||||
|
LD a8, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t4, a4
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
blez I, .L24
|
||||||
|
daddu X, X, INCX
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* General-stride main loop: adds the current group of eight while loading the next eight. */
.L23:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a5
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a2, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a6
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a7
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
LD a4, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t4, a8
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a6, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a2
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a3
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
LD a8, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t4, a4
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
bgtz I, .L23
|
||||||
|
daddu X, X, INCX
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Drain: add the last preloaded group of eight values. */
.L24:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
MOV t1, a5
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
MOV t2, a6
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
MOV t3, a7
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
MOV t4, a8
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
ADD s2, s2, t2
|
||||||
|
ADD s1, s1, t3
|
||||||
|
ADD s2, s2, t4
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* General-stride remainder: N & 7 leftover values, one per iteration, added into s1. */
.L25:
|
||||||
|
andi I, N, 7
|
||||||
|
|
||||||
|
blez I, .L999
|
||||||
|
NOP
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
.L26:
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
bgtz I, .L26
|
||||||
|
ADD s1, s1, t1
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Return; the two partial sums are combined into s1 by the instruction following the jump. */
.L999:
|
||||||
|
j $31
|
||||||
|
ADD s1, s1, s2
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,204 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Incoming arguments: element count, data pointer, element stride. */
#define N $4
|
||||||
|
#define X $5
|
||||||
|
#define INCX $6
|
||||||
|
|
||||||
|
/* Scratch integer registers: I is the loop counter; TEMP is not referenced by the kernel that follows. */
#define I $2
|
||||||
|
#define TEMP $3
|
||||||
|
|
||||||
|
/* a1-a8: destinations of the loads from X (two per element: offsets 0 and 1 * SIZE). */
#define a1 $f2
|
||||||
|
#define a2 $f3
|
||||||
|
#define a3 $f4
|
||||||
|
#define a4 $f5
|
||||||
|
#define a5 $f6
|
||||||
|
#define a6 $f7
|
||||||
|
#define a7 $f8
|
||||||
|
#define a8 $f9
|
||||||
|
|
||||||
|
/* t1-t4: copies of loaded values staged for the next round of adds. */
#define t1 $f10
|
||||||
|
#define t2 $f11
|
||||||
|
#define t3 $f12
|
||||||
|
#define t4 $f13
|
||||||
|
|
||||||
|
/* s1, s2: the two running partial sums; they are added together at .L999. */
#define s1 $f0
|
||||||
|
#define s2 $f1
|
||||||
|
|
||||||
|
/*
 * Sum kernel for two-value elements (presumably complex data): for each of
 * the N elements it reads the values at offsets 0 and 1 * SIZE, adding the
 * first into s1 and the second into s2, without taking absolute values.
 * s1 + s2 is formed on return. The instruction written after each branch
 * or jump appears to be relied on as its delay slot.
 * NOTE(review): only N <= 0 is checked; there is no INCX <= 0 test here.
 */
PROLOGUE
|
||||||
|
|
||||||
|
/* With F_INTERFACE, N and INCX arrive as addresses and are loaded here. */
#ifdef F_INTERFACE
|
||||||
|
LDINT N, 0(N)
|
||||||
|
LDINT INCX, 0(INCX)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Both partial sums start from the zero register. */
MTC $0, s1
|
||||||
|
|
||||||
|
MTC $0, s2
|
||||||
|
/* Scale INCX by 2^ZBASE_SHIFT - presumably element stride to byte stride; ZBASE_SHIFT is defined outside this file. */
dsll INCX, INCX, ZBASE_SHIFT
|
||||||
|
|
||||||
|
/* I = N / 4: number of four-element groups. */
blez N, .L999
|
||||||
|
dsra I, N, 2
|
||||||
|
|
||||||
|
blez I, .L25
|
||||||
|
NOP
|
||||||
|
|
||||||
|
/* Preload the first four elements (eight values) and stage the first two elements in t1-t4. */
LD a1, 0 * SIZE(X)
|
||||||
|
LD a2, 1 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
LD a4, 1 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
LD a6, 1 * SIZE(X)
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
MOV t2, a2
|
||||||
|
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
LD a8, 1 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a3
|
||||||
|
MOV t4, a4
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
blez I, .L24
|
||||||
|
daddu X, X, INCX
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Main loop: adds the current group of four elements (offset-0 values into s1, offset-1 values into s2) while loading the next group. */
.L23:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a5
|
||||||
|
daddiu I, I, -1
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a2, 1 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a6
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a3, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a7
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
LD a4, 1 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t4, a8
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
LD a5, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
NOP
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
LD a6, 1 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t2, a2
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
LD a7, 0 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t3, a3
|
||||||
|
LD a8, 1 * SIZE(X)
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
bgtz I, .L23
|
||||||
|
MOV t4, a4
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Drain: add the last preloaded group of four elements; no further loads. */
.L24:
|
||||||
|
ADD s1, s1, t1
|
||||||
|
MOV t1, a5
|
||||||
|
|
||||||
|
ADD s2, s2, t2
|
||||||
|
MOV t2, a6
|
||||||
|
|
||||||
|
ADD s1, s1, t3
|
||||||
|
MOV t3, a7
|
||||||
|
|
||||||
|
ADD s2, s2, t4
|
||||||
|
MOV t4, a8
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
ADD s2, s2, t2
|
||||||
|
ADD s1, s1, t3
|
||||||
|
ADD s2, s2, t4
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Remainder: the N & 3 leftover elements, one element (two values) per iteration. */
.L25:
|
||||||
|
andi I, N, 3
|
||||||
|
|
||||||
|
blez I, .L999
|
||||||
|
NOP
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
.L26:
|
||||||
|
LD a1, 0 * SIZE(X)
|
||||||
|
LD a2, 1 * SIZE(X)
|
||||||
|
|
||||||
|
MOV t1, a1
|
||||||
|
daddiu I, I, -1
|
||||||
|
MOV t2, a2
|
||||||
|
daddu X, X, INCX
|
||||||
|
|
||||||
|
ADD s1, s1, t1
|
||||||
|
bgtz I, .L26
|
||||||
|
ADD s2, s2, t2
|
||||||
|
.align 3
|
||||||
|
|
||||||
|
/* Return; the two partial sums are combined into s1 by the instruction following the jump. */
.L999:
|
||||||
|
j $31
|
||||||
|
ADD s1, s1, s2
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,446 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N r3
|
||||||
|
#define X r4
|
||||||
|
#define INCX r5
|
||||||
|
|
||||||
|
#define PREA r8
|
||||||
|
|
||||||
|
#define FZERO f0
|
||||||
|
|
||||||
|
#define STACKSIZE 160
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
addi SP, SP, -STACKSIZE
|
||||||
|
li r0, 0
|
||||||
|
|
||||||
|
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
stw r0, 144(SP)
|
||||||
|
lfs FZERO,144(SP)
|
||||||
|
|
||||||
|
#ifdef F_INTERFACE
|
||||||
|
LDINT N, 0(N)
|
||||||
|
LDINT INCX, 0(INCX)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
slwi INCX, INCX, BASE_SHIFT
|
||||||
|
|
||||||
|
fmr f1, FZERO
|
||||||
|
fmr f2, FZERO
|
||||||
|
fmr f3, FZERO
|
||||||
|
fmr f4, FZERO
|
||||||
|
fmr f5, FZERO
|
||||||
|
fmr f6, FZERO
|
||||||
|
fmr f7, FZERO
|
||||||
|
|
||||||
|
li PREA, L1_PREFETCHSIZE
|
||||||
|
|
||||||
|
cmpwi cr0, N, 0
|
||||||
|
ble- LL(999)
|
||||||
|
|
||||||
|
cmpwi cr0, INCX, 0
|
||||||
|
ble- LL(999)
|
||||||
|
|
||||||
|
cmpwi cr0, INCX, SIZE
|
||||||
|
bne- cr0, LL(100)
|
||||||
|
|
||||||
|
srawi. r0, N, 4
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq- cr0, LL(50)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LFD f8, 0 * SIZE(X)
|
||||||
|
LFD f9, 1 * SIZE(X)
|
||||||
|
LFD f10, 2 * SIZE(X)
|
||||||
|
LFD f11, 3 * SIZE(X)
|
||||||
|
LFD f12, 4 * SIZE(X)
|
||||||
|
LFD f13, 5 * SIZE(X)
|
||||||
|
LFD f14, 6 * SIZE(X)
|
||||||
|
LFD f15, 7 * SIZE(X)
|
||||||
|
|
||||||
|
LFD f24, 8 * SIZE(X)
|
||||||
|
LFD f25, 9 * SIZE(X)
|
||||||
|
LFD f26, 10 * SIZE(X)
|
||||||
|
LFD f27, 11 * SIZE(X)
|
||||||
|
LFD f28, 12 * SIZE(X)
|
||||||
|
LFD f29, 13 * SIZE(X)
|
||||||
|
LFD f30, 14 * SIZE(X)
|
||||||
|
LFD f31, 15 * SIZE(X)
|
||||||
|
|
||||||
|
fmr f16, f8
|
||||||
|
fmr f17, f9
|
||||||
|
fmr f18, f10
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
fmr f20, f12
|
||||||
|
fmr f21, f13
|
||||||
|
fmr f22, f14
|
||||||
|
fmr f23, f15
|
||||||
|
bdz LL(20)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(10):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
LFD f8, 16 * SIZE(X)
|
||||||
|
LFD f9, 17 * SIZE(X)
|
||||||
|
LFD f10, 18 * SIZE(X)
|
||||||
|
LFD f11, 19 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
LFD f12, 20 * SIZE(X)
|
||||||
|
LFD f13, 21 * SIZE(X)
|
||||||
|
LFD f14, 22 * SIZE(X)
|
||||||
|
LFD f15, 23 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f8
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f9
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f10
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
LFD f24, 24 * SIZE(X)
|
||||||
|
LFD f25, 25 * SIZE(X)
|
||||||
|
LFD f26, 26 * SIZE(X)
|
||||||
|
LFD f27, 27 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f12
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f13
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f14
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f15
|
||||||
|
|
||||||
|
LFD f28, 28 * SIZE(X)
|
||||||
|
LFD f29, 29 * SIZE(X)
|
||||||
|
LFD f30, 30 * SIZE(X)
|
||||||
|
LFD f31, 31 * SIZE(X)
|
||||||
|
|
||||||
|
#ifndef POWER6
|
||||||
|
L1_PREFETCH X, PREA
|
||||||
|
#endif
|
||||||
|
addi X, X, 16 * SIZE
|
||||||
|
#ifdef POWER6
|
||||||
|
L1_PREFETCH X, PREA
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bdnz LL(10)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(20):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
FADD f1, f1, f17
|
||||||
|
FADD f2, f2, f18
|
||||||
|
FADD f3, f3, f19
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
FADD f5, f5, f21
|
||||||
|
FADD f6, f6, f22
|
||||||
|
FADD f7, f7, f23
|
||||||
|
addi X, X, 16 * SIZE
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(50):
|
||||||
|
andi. r0, N, 15
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(60):
|
||||||
|
LFD f8, 0 * SIZE(X)
|
||||||
|
addi X, X, 1 * SIZE
|
||||||
|
|
||||||
|
FADD f0, f0, f8
|
||||||
|
|
||||||
|
bdnz LL(60)
|
||||||
|
b LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(100):
|
||||||
|
sub X, X, INCX
|
||||||
|
|
||||||
|
srawi. r0, N, 4
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq- LL(150)
|
||||||
|
|
||||||
|
LFDUX f8, X, INCX
|
||||||
|
LFDUX f9, X, INCX
|
||||||
|
LFDUX f10, X, INCX
|
||||||
|
LFDUX f11, X, INCX
|
||||||
|
LFDUX f12, X, INCX
|
||||||
|
LFDUX f13, X, INCX
|
||||||
|
LFDUX f14, X, INCX
|
||||||
|
LFDUX f15, X, INCX
|
||||||
|
|
||||||
|
LFDUX f24, X, INCX
|
||||||
|
LFDUX f25, X, INCX
|
||||||
|
LFDUX f26, X, INCX
|
||||||
|
LFDUX f27, X, INCX
|
||||||
|
LFDUX f28, X, INCX
|
||||||
|
LFDUX f29, X, INCX
|
||||||
|
LFDUX f30, X, INCX
|
||||||
|
LFDUX f31, X, INCX
|
||||||
|
|
||||||
|
fmr f16, f8
|
||||||
|
fmr f17, f9
|
||||||
|
fmr f18, f10
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
fmr f20, f12
|
||||||
|
fmr f21, f13
|
||||||
|
fmr f22, f14
|
||||||
|
fmr f23, f15
|
||||||
|
bdz LL(120)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(110):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
LFDUX f8, X, INCX
|
||||||
|
LFDUX f9, X, INCX
|
||||||
|
LFDUX f10, X, INCX
|
||||||
|
LFDUX f11, X, INCX
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
LFDUX f12, X, INCX
|
||||||
|
LFDUX f13, X, INCX
|
||||||
|
LFDUX f14, X, INCX
|
||||||
|
LFDUX f15, X, INCX
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f8
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f9
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f10
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
LFDUX f24, X, INCX
|
||||||
|
LFDUX f25, X, INCX
|
||||||
|
LFDUX f26, X, INCX
|
||||||
|
LFDUX f27, X, INCX
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f12
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f13
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f14
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f15
|
||||||
|
|
||||||
|
LFDUX f28, X, INCX
|
||||||
|
LFDUX f29, X, INCX
|
||||||
|
LFDUX f30, X, INCX
|
||||||
|
LFDUX f31, X, INCX
|
||||||
|
bdnz LL(110)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(120):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
FADD f1, f1, f17
|
||||||
|
FADD f2, f2, f18
|
||||||
|
FADD f3, f3, f19
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
FADD f5, f5, f21
|
||||||
|
FADD f6, f6, f22
|
||||||
|
FADD f7, f7, f23
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(150):
|
||||||
|
andi. r0, N, 15
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(160):
|
||||||
|
LFDUX f8, X, INCX
|
||||||
|
FADD f0, f0, f8
|
||||||
|
bdnz LL(160)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(999):
|
||||||
|
FADD f0, f0, f1
|
||||||
|
FADD f2, f2, f3
|
||||||
|
FADD f4, f4, f5
|
||||||
|
FADD f6, f6, f7
|
||||||
|
|
||||||
|
FADD f0, f0, f2
|
||||||
|
FADD f4, f4, f6
|
||||||
|
FADD f1, f0, f4
|
||||||
|
|
||||||
|
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
blr
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,452 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N r3
|
||||||
|
#define X r4
|
||||||
|
#define INCX r5
|
||||||
|
|
||||||
|
#define INCXM1 r9
|
||||||
|
#define PREA r8
|
||||||
|
|
||||||
|
#define FZERO f0
|
||||||
|
|
||||||
|
#define STACKSIZE 160
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
addi SP, SP, -STACKSIZE
|
||||||
|
li r0, 0
|
||||||
|
|
||||||
|
stfd f14, 0(SP)
|
||||||
|
stfd f15, 8(SP)
|
||||||
|
stfd f16, 16(SP)
|
||||||
|
stfd f17, 24(SP)
|
||||||
|
|
||||||
|
stfd f18, 32(SP)
|
||||||
|
stfd f19, 40(SP)
|
||||||
|
stfd f20, 48(SP)
|
||||||
|
stfd f21, 56(SP)
|
||||||
|
|
||||||
|
stfd f22, 64(SP)
|
||||||
|
stfd f23, 72(SP)
|
||||||
|
stfd f24, 80(SP)
|
||||||
|
stfd f25, 88(SP)
|
||||||
|
|
||||||
|
stfd f26, 96(SP)
|
||||||
|
stfd f27, 104(SP)
|
||||||
|
stfd f28, 112(SP)
|
||||||
|
stfd f29, 120(SP)
|
||||||
|
|
||||||
|
stfd f30, 128(SP)
|
||||||
|
stfd f31, 136(SP)
|
||||||
|
|
||||||
|
stw r0, 144(SP)
|
||||||
|
lfs FZERO,144(SP)
|
||||||
|
|
||||||
|
#ifdef F_INTERFACE
|
||||||
|
LDINT N, 0(N)
|
||||||
|
LDINT INCX, 0(INCX)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
slwi INCX, INCX, ZBASE_SHIFT
|
||||||
|
subi INCXM1, INCX, SIZE
|
||||||
|
|
||||||
|
fmr f1, FZERO
|
||||||
|
fmr f2, FZERO
|
||||||
|
fmr f3, FZERO
|
||||||
|
fmr f4, FZERO
|
||||||
|
fmr f5, FZERO
|
||||||
|
fmr f6, FZERO
|
||||||
|
fmr f7, FZERO
|
||||||
|
|
||||||
|
li PREA, L1_PREFETCHSIZE
|
||||||
|
|
||||||
|
cmpwi cr0, N, 0
|
||||||
|
ble- LL(999)
|
||||||
|
|
||||||
|
cmpwi cr0, INCX, 0
|
||||||
|
ble- LL(999)
|
||||||
|
|
||||||
|
cmpwi cr0, INCX, 2 * SIZE
|
||||||
|
bne- cr0, LL(100)
|
||||||
|
|
||||||
|
srawi. r0, N, 3
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq- cr0, LL(50)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LFD f8, 0 * SIZE(X)
|
||||||
|
LFD f9, 1 * SIZE(X)
|
||||||
|
LFD f10, 2 * SIZE(X)
|
||||||
|
LFD f11, 3 * SIZE(X)
|
||||||
|
LFD f12, 4 * SIZE(X)
|
||||||
|
LFD f13, 5 * SIZE(X)
|
||||||
|
LFD f14, 6 * SIZE(X)
|
||||||
|
LFD f15, 7 * SIZE(X)
|
||||||
|
|
||||||
|
LFD f24, 8 * SIZE(X)
|
||||||
|
LFD f25, 9 * SIZE(X)
|
||||||
|
LFD f26, 10 * SIZE(X)
|
||||||
|
LFD f27, 11 * SIZE(X)
|
||||||
|
LFD f28, 12 * SIZE(X)
|
||||||
|
LFD f29, 13 * SIZE(X)
|
||||||
|
LFD f30, 14 * SIZE(X)
|
||||||
|
LFD f31, 15 * SIZE(X)
|
||||||
|
|
||||||
|
fmr f16, f8
|
||||||
|
fmr f17, f9
|
||||||
|
fmr f18, f10
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
fmr f20, f12
|
||||||
|
fmr f21, f13
|
||||||
|
fmr f22, f14
|
||||||
|
fmr f23, f15
|
||||||
|
bdz LL(20)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(10):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
LFD f8, 16 * SIZE(X)
|
||||||
|
LFD f9, 17 * SIZE(X)
|
||||||
|
LFD f10, 18 * SIZE(X)
|
||||||
|
LFD f11, 19 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
LFD f12, 20 * SIZE(X)
|
||||||
|
LFD f13, 21 * SIZE(X)
|
||||||
|
LFD f14, 22 * SIZE(X)
|
||||||
|
LFD f15, 23 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f8
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f9
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f10
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
LFD f24, 24 * SIZE(X)
|
||||||
|
LFD f25, 25 * SIZE(X)
|
||||||
|
LFD f26, 26 * SIZE(X)
|
||||||
|
LFD f27, 27 * SIZE(X)
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f12
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f13
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f14
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f15
|
||||||
|
|
||||||
|
LFD f28, 28 * SIZE(X)
|
||||||
|
LFD f29, 29 * SIZE(X)
|
||||||
|
LFD f30, 30 * SIZE(X)
|
||||||
|
LFD f31, 31 * SIZE(X)
|
||||||
|
|
||||||
|
#ifndef POWER6
|
||||||
|
L1_PREFETCH X, PREA
|
||||||
|
#endif
|
||||||
|
addi X, X, 16 * SIZE
|
||||||
|
#ifdef POWER6
|
||||||
|
L1_PREFETCH X, PREA
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bdnz LL(10)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(20):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
FADD f1, f1, f17
|
||||||
|
FADD f2, f2, f18
|
||||||
|
FADD f3, f3, f19
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
FADD f5, f5, f21
|
||||||
|
FADD f6, f6, f22
|
||||||
|
FADD f7, f7, f23
|
||||||
|
addi X, X, 16 * SIZE
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(50):
|
||||||
|
andi. r0, N, 7
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(60):
|
||||||
|
LFD f8, 0 * SIZE(X)
|
||||||
|
LFD f9, 1 * SIZE(X)
|
||||||
|
addi X, X, 2 * SIZE
|
||||||
|
|
||||||
|
FADD f0, f0, f8
|
||||||
|
FADD f1, f1, f9
|
||||||
|
|
||||||
|
bdnz LL(60)
|
||||||
|
b LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(100):
|
||||||
|
sub X, X, INCXM1
|
||||||
|
|
||||||
|
srawi. r0, N, 3
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq- LL(150)
|
||||||
|
|
||||||
|
LFDX f8, X, INCXM1
|
||||||
|
LFDUX f9, X, INCX
|
||||||
|
LFDX f10, X, INCXM1
|
||||||
|
LFDUX f11, X, INCX
|
||||||
|
LFDX f12, X, INCXM1
|
||||||
|
LFDUX f13, X, INCX
|
||||||
|
LFDX f14, X, INCXM1
|
||||||
|
LFDUX f15, X, INCX
|
||||||
|
|
||||||
|
LFDX f24, X, INCXM1
|
||||||
|
LFDUX f25, X, INCX
|
||||||
|
LFDX f26, X, INCXM1
|
||||||
|
LFDUX f27, X, INCX
|
||||||
|
LFDX f28, X, INCXM1
|
||||||
|
LFDUX f29, X, INCX
|
||||||
|
LFDX f30, X, INCXM1
|
||||||
|
LFDUX f31, X, INCX
|
||||||
|
|
||||||
|
fmr f16, f8
|
||||||
|
fmr f17, f9
|
||||||
|
fmr f18, f10
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
fmr f20, f12
|
||||||
|
fmr f21, f13
|
||||||
|
fmr f22, f14
|
||||||
|
fmr f23, f15
|
||||||
|
bdz LL(120)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(110):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
LFDX f8, X, INCXM1
|
||||||
|
LFDUX f9, X, INCX
|
||||||
|
LFDX f10, X, INCXM1
|
||||||
|
LFDUX f11, X, INCX
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
LFDX f12, X, INCXM1
|
||||||
|
LFDUX f13, X, INCX
|
||||||
|
LFDX f14, X, INCXM1
|
||||||
|
LFDUX f15, X, INCX
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f8
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f9
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f10
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f11
|
||||||
|
|
||||||
|
LFDX f24, X, INCXM1
|
||||||
|
LFDUX f25, X, INCX
|
||||||
|
LFDX f26, X, INCXM1
|
||||||
|
LFDUX f27, X, INCX
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f12
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f13
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f14
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f15
|
||||||
|
|
||||||
|
LFDX f28, X, INCXM1
|
||||||
|
LFDUX f29, X, INCX
|
||||||
|
LFDX f30, X, INCXM1
|
||||||
|
LFDUX f31, X, INCX
|
||||||
|
bdnz LL(110)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(120):
|
||||||
|
FADD f0, f0, f16
|
||||||
|
fmr f16, f24
|
||||||
|
FADD f1, f1, f17
|
||||||
|
fmr f17, f25
|
||||||
|
|
||||||
|
FADD f2, f2, f18
|
||||||
|
fmr f18, f26
|
||||||
|
FADD f3, f3, f19
|
||||||
|
fmr f19, f27
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
fmr f20, f28
|
||||||
|
FADD f5, f5, f21
|
||||||
|
fmr f21, f29
|
||||||
|
|
||||||
|
FADD f6, f6, f22
|
||||||
|
fmr f22, f30
|
||||||
|
FADD f7, f7, f23
|
||||||
|
fmr f23, f31
|
||||||
|
|
||||||
|
FADD f0, f0, f16
|
||||||
|
FADD f1, f1, f17
|
||||||
|
FADD f2, f2, f18
|
||||||
|
FADD f3, f3, f19
|
||||||
|
|
||||||
|
FADD f4, f4, f20
|
||||||
|
FADD f5, f5, f21
|
||||||
|
FADD f6, f6, f22
|
||||||
|
FADD f7, f7, f23
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(150):
|
||||||
|
andi. r0, N, 7
|
||||||
|
mtspr CTR, r0
|
||||||
|
beq LL(999)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(160):
|
||||||
|
LFDX f8, X, INCXM1
|
||||||
|
LFDUX f9, X, INCX
|
||||||
|
FADD f0, f0, f8
|
||||||
|
FADD f1, f1, f9
|
||||||
|
bdnz LL(160)
|
||||||
|
.align 4
|
||||||
|
|
||||||
|
LL(999):
|
||||||
|
FADD f0, f0, f1
|
||||||
|
FADD f2, f2, f3
|
||||||
|
FADD f4, f4, f5
|
||||||
|
FADD f6, f6, f7
|
||||||
|
|
||||||
|
FADD f0, f0, f2
|
||||||
|
FADD f4, f4, f6
|
||||||
|
FADD f1, f0, f4
|
||||||
|
|
||||||
|
lfd f14, 0(SP)
|
||||||
|
lfd f15, 8(SP)
|
||||||
|
lfd f16, 16(SP)
|
||||||
|
lfd f17, 24(SP)
|
||||||
|
|
||||||
|
lfd f18, 32(SP)
|
||||||
|
lfd f19, 40(SP)
|
||||||
|
lfd f20, 48(SP)
|
||||||
|
lfd f21, 56(SP)
|
||||||
|
|
||||||
|
lfd f22, 64(SP)
|
||||||
|
lfd f23, 72(SP)
|
||||||
|
lfd f24, 80(SP)
|
||||||
|
lfd f25, 88(SP)
|
||||||
|
|
||||||
|
lfd f26, 96(SP)
|
||||||
|
lfd f27, 104(SP)
|
||||||
|
lfd f28, 112(SP)
|
||||||
|
lfd f29, 120(SP)
|
||||||
|
|
||||||
|
lfd f30, 128(SP)
|
||||||
|
lfd f31, 136(SP)
|
||||||
|
|
||||||
|
addi SP, SP, STACKSIZE
|
||||||
|
blr
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
|
|
||||||
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
|
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
|
||||||
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
|
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
|
||||||
snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
|
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
|
||||||
dsdot_kTS,
|
dsdot_kTS,
|
||||||
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
|
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
|
||||||
sgemv_nTS, sgemv_tTS, sger_kTS,
|
sgemv_nTS, sgemv_tTS, sger_kTS,
|
||||||
|
@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
|
|
||||||
damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
|
damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
|
||||||
idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
|
idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
|
||||||
dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
|
dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS,
|
||||||
drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
|
drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
|
||||||
dgemv_nTS, dgemv_tTS, dger_kTS,
|
dgemv_nTS, dgemv_tTS, dger_kTS,
|
||||||
dsymv_LTS, dsymv_UTS,
|
dsymv_LTS, dsymv_UTS,
|
||||||
|
@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
|
|
||||||
qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
|
qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
|
||||||
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
|
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
|
||||||
qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
|
qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS,
|
||||||
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
|
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
|
||||||
qgemv_nTS, qgemv_tTS, qger_kTS,
|
qgemv_nTS, qgemv_tTS, qger_kTS,
|
||||||
qsymv_LTS, qsymv_UTS,
|
qsymv_LTS, qsymv_UTS,
|
||||||
|
@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
|
camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
|
||||||
cnrm2_kTS, casum_kTS, ccopy_kTS,
|
cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
|
||||||
cdotu_kTS, cdotc_kTS, csrot_kTS,
|
cdotu_kTS, cdotc_kTS, csrot_kTS,
|
||||||
caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
|
caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
|
||||||
|
|
||||||
|
@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
|
zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
|
||||||
znrm2_kTS, zasum_kTS, zcopy_kTS,
|
znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
|
||||||
zdotu_kTS, zdotc_kTS, zdrot_kTS,
|
zdotu_kTS, zdotc_kTS, zdrot_kTS,
|
||||||
zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
|
zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
|
||||||
|
|
||||||
|
@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
|
XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
|
||||||
|
|
||||||
xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
|
xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
|
||||||
xnrm2_kTS, xasum_kTS, xcopy_kTS,
|
xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
|
||||||
xdotu_kTS, xdotc_kTS, xqrot_kTS,
|
xdotu_kTS, xdotc_kTS, xqrot_kTS,
|
||||||
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
|
xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,325 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define N %i0
|
||||||
|
#define X %i1
|
||||||
|
#define INCX %i2
|
||||||
|
#define I %i3
|
||||||
|
|
||||||
|
#ifdef DOUBLE
|
||||||
|
#define c1 %f0
|
||||||
|
#define c2 %f2
|
||||||
|
#define t1 %f8
|
||||||
|
#define t2 %f10
|
||||||
|
#define t3 %f12
|
||||||
|
#define t4 %f14
|
||||||
|
|
||||||
|
#define a1 %f16
|
||||||
|
#define a2 %f18
|
||||||
|
#define a3 %f20
|
||||||
|
#define a4 %f22
|
||||||
|
#define a5 %f24
|
||||||
|
#define a6 %f26
|
||||||
|
#define a7 %f28
|
||||||
|
#define a8 %f30
|
||||||
|
#else
|
||||||
|
#define c1 %f0
|
||||||
|
#define c2 %f1
|
||||||
|
#define t1 %f4
|
||||||
|
#define t2 %f5
|
||||||
|
#define t3 %f6
|
||||||
|
#define t4 %f7
|
||||||
|
|
||||||
|
#define a1 %f8
|
||||||
|
#define a2 %f9
|
||||||
|
#define a3 %f10
|
||||||
|
#define a4 %f11
|
||||||
|
#define a5 %f12
|
||||||
|
#define a6 %f13
|
||||||
|
#define a7 %f14
|
||||||
|
#define a8 %f15
|
||||||
|
#endif
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
SAVESP
|
||||||
|
|
||||||
|
FCLR(0)
|
||||||
|
|
||||||
|
sll INCX, BASE_SHIFT, INCX
|
||||||
|
|
||||||
|
FMOV c1, c2
|
||||||
|
FMOV c1, t1
|
||||||
|
FMOV c1, t2
|
||||||
|
FMOV c1, t3
|
||||||
|
FMOV c1, t4
|
||||||
|
|
||||||
|
cmp INCX, 0
|
||||||
|
ble .LL19
|
||||||
|
cmp INCX, SIZE
|
||||||
|
bne .LL50
|
||||||
|
|
||||||
|
sra N, 3, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,pn %icc, .LL15
|
||||||
|
nop
|
||||||
|
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
add I, -1, I
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
cmp I, 0
|
||||||
|
LDF [X + 2 * SIZE], a3
|
||||||
|
LDF [X + 3 * SIZE], a4
|
||||||
|
LDF [X + 4 * SIZE], a5
|
||||||
|
LDF [X + 5 * SIZE], a6
|
||||||
|
LDF [X + 6 * SIZE], a7
|
||||||
|
LDF [X + 7 * SIZE], a8
|
||||||
|
|
||||||
|
ble,pt %icc, .LL12
|
||||||
|
add X, 8 * SIZE, X
|
||||||
|
|
||||||
|
#define PREFETCHSIZE 128
|
||||||
|
|
||||||
|
.LL11:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
prefetch [X + PREFETCHSIZE * SIZE], 0
|
||||||
|
FMOV a1, t1
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a2, t2
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
cmp I, 0
|
||||||
|
FMOV a3, t3
|
||||||
|
LDF [X + 2 * SIZE], a3
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
nop
|
||||||
|
FMOV a4, t4
|
||||||
|
LDF [X + 3 * SIZE], a4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
nop
|
||||||
|
FMOV a5, t1
|
||||||
|
LDF [X + 4 * SIZE], a5
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
nop
|
||||||
|
FMOV a6, t2
|
||||||
|
LDF [X + 5 * SIZE], a6
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
LDF [X + 6 * SIZE], a7
|
||||||
|
add X, 8 * SIZE, X
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
bg,pt %icc, .LL11
|
||||||
|
LDF [X - 1 * SIZE], a8
|
||||||
|
|
||||||
|
.LL12:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a1, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a2, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
|
||||||
|
.LL15:
|
||||||
|
and N, 7, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,a,pn %icc, .LL19
|
||||||
|
nop
|
||||||
|
|
||||||
|
.LL16:
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
add I, -1, I
|
||||||
|
cmp I, 0
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a1, t1
|
||||||
|
bg,pt %icc, .LL16
|
||||||
|
add X, 1 * SIZE, X
|
||||||
|
|
||||||
|
.LL19:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FADD c2, t4, c2
|
||||||
|
|
||||||
|
FADD c1, c2, c1
|
||||||
|
return %i7 + 8
|
||||||
|
clr %g0
|
||||||
|
|
||||||
|
.LL50:
|
||||||
|
sra N, 3, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,pn %icc, .LL55
|
||||||
|
nop
|
||||||
|
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a2
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a3
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a4
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a5
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a6
|
||||||
|
add X, INCX, X
|
||||||
|
add I, -1, I
|
||||||
|
LDF [X + 0 * SIZE], a7
|
||||||
|
cmp I, 0
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a8
|
||||||
|
|
||||||
|
ble,pt %icc, .LL52
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL51:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a1, t1
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
cmp I, 0
|
||||||
|
FMOV a2, t2
|
||||||
|
LDF [X + 0 * SIZE], a2
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
LDF [X + 0 * SIZE], a3
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
LDF [X + 0 * SIZE], a4
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
LDF [X + 0 * SIZE], a5
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
LDF [X + 0 * SIZE], a6
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
LDF [X + 0 * SIZE], a7
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
LDF [X + 0 * SIZE], a8
|
||||||
|
|
||||||
|
bg,pt %icc, .LL51
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL52:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a1, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a2, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
|
||||||
|
.LL55:
|
||||||
|
and N, 7, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,a,pn %icc, .LL59
|
||||||
|
nop
|
||||||
|
|
||||||
|
.LL56:
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
FADD c1, t1, c1
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a1, t1
|
||||||
|
cmp I, 0
|
||||||
|
bg,pt %icc, .LL56
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL59:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FADD c2, t4, c2
|
||||||
|
|
||||||
|
FADD c1, c2, c1
|
||||||
|
return %i7 + 8
|
||||||
|
clr %o0
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,327 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Integer register aliases.  N, X and INCX are read by the code below   */
/* without being set in this file (presumably the incoming arguments --  */
/* confirm against the calling convention); I is the loop counter.       */
#define N %i0
|
||||||
|
#define X %i1
|
||||||
|
#define INCX %i2
|
||||||
|
#define I %i3
|
||||||
|
|
||||||
|
/* Floating-point register aliases:                                      */
/*   c1, c2   running sums, combined into c1 on exit                     */
/*   t1..t4   values staged for the next FADD into c1/c2                 */
/*   a1..a8   elements most recently loaded from X                       */
/* The DOUBLE build uses even-numbered registers only; the other build   */
/* uses consecutive registers.                                           */
#ifdef DOUBLE
|
||||||
|
#define c1 %f0
|
||||||
|
#define c2 %f2
|
||||||
|
#define t1 %f8
|
||||||
|
#define t2 %f10
|
||||||
|
#define t3 %f12
|
||||||
|
#define t4 %f14
|
||||||
|
|
||||||
|
#define a1 %f16
|
||||||
|
#define a2 %f18
|
||||||
|
#define a3 %f20
|
||||||
|
#define a4 %f22
|
||||||
|
#define a5 %f24
|
||||||
|
#define a6 %f26
|
||||||
|
#define a7 %f28
|
||||||
|
#define a8 %f30
|
||||||
|
#else
|
||||||
|
#define c1 %f0
|
||||||
|
#define c2 %f1
|
||||||
|
#define t1 %f4
|
||||||
|
#define t2 %f5
|
||||||
|
#define t3 %f6
|
||||||
|
#define t4 %f7
|
||||||
|
|
||||||
|
#define a1 %f8
|
||||||
|
#define a2 %f9
|
||||||
|
#define a3 %f10
|
||||||
|
#define a4 %f11
|
||||||
|
#define a5 %f12
|
||||||
|
#define a6 %f13
|
||||||
|
#define a7 %f14
|
||||||
|
#define a8 %f15
|
||||||
|
#endif
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
SAVESP
|
||||||
|
|
||||||
|
FCLR(0)
|
||||||
|
|
||||||
|
sll INCX, ZBASE_SHIFT, INCX
|
||||||
|
|
||||||
|
FMOV c1, c2
|
||||||
|
FMOV c1, t1
|
||||||
|
FMOV c1, t2
|
||||||
|
FMOV c1, t3
|
||||||
|
FMOV c1, t4
|
||||||
|
|
||||||
|
cmp INCX, 0
|
||||||
|
ble .LL19
|
||||||
|
nop
|
||||||
|
|
||||||
|
cmp INCX, 2 * SIZE
|
||||||
|
bne .LL50
|
||||||
|
nop
|
||||||
|
|
||||||
|
sra N, 2, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,pn %icc, .LL15
|
||||||
|
nop
|
||||||
|
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
add I, -1, I
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
cmp I, 0
|
||||||
|
LDF [X + 2 * SIZE], a3
|
||||||
|
LDF [X + 3 * SIZE], a4
|
||||||
|
LDF [X + 4 * SIZE], a5
|
||||||
|
LDF [X + 5 * SIZE], a6
|
||||||
|
LDF [X + 6 * SIZE], a7
|
||||||
|
LDF [X + 7 * SIZE], a8
|
||||||
|
|
||||||
|
ble,pt %icc, .LL12
|
||||||
|
add X, 8 * SIZE, X
|
||||||
|
|
||||||
|
#define PREFETCHSIZE 32
|
||||||
|
|
||||||
|
.LL11:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
prefetch [X + PREFETCHSIZE * SIZE], 0
|
||||||
|
FMOV a1, t1
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a2, t2
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
cmp I, 0
|
||||||
|
FMOV a3, t3
|
||||||
|
LDF [X + 2 * SIZE], a3
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
nop
|
||||||
|
FMOV a4, t4
|
||||||
|
LDF [X + 3 * SIZE], a4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
nop
|
||||||
|
FMOV a5, t1
|
||||||
|
LDF [X + 4 * SIZE], a5
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
nop
|
||||||
|
FMOV a6, t2
|
||||||
|
LDF [X + 5 * SIZE], a6
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
LDF [X + 6 * SIZE], a7
|
||||||
|
add X, 8 * SIZE, X
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
bg,pt %icc, .LL11
|
||||||
|
LDF [X - 1 * SIZE], a8
|
||||||
|
|
||||||
|
.LL12:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a1, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a2, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
|
||||||
|
.LL15:
|
||||||
|
and N, 3, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,a,pn %icc, .LL19
|
||||||
|
nop
|
||||||
|
|
||||||
|
.LL16:
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
add I, -1, I
|
||||||
|
cmp I, 0
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a1, t1
|
||||||
|
FMOV a2, t2
|
||||||
|
bg,pt %icc, .LL16
|
||||||
|
add X, 2 * SIZE, X
|
||||||
|
|
||||||
|
/* Exit used by the unit-stride path and by the INCX <= 0 early-out:     */
/* fold the four staged values t1..t4 into the running sums c1/c2, then  */
/* combine the two sums into c1.                                         */
.LL19:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FADD c2, t4, c2
|
||||||
|
|
||||||
|
FADD c1, c2, c1
|
||||||
|
return %i7 + 8
|
||||||
|
/* Delay slot of the return.  Clears %o0, matching the .LL59 exit        */
/* (a write to %g0 would simply be discarded).                           */
clr %o0
|
||||||
|
|
||||||
|
.LL50:
|
||||||
|
sra N, 2, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,pn %icc, .LL55
|
||||||
|
nop
|
||||||
|
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a3
|
||||||
|
LDF [X + 1 * SIZE], a4
|
||||||
|
add X, INCX, X
|
||||||
|
LDF [X + 0 * SIZE], a5
|
||||||
|
LDF [X + 1 * SIZE], a6
|
||||||
|
add X, INCX, X
|
||||||
|
add I, -1, I
|
||||||
|
LDF [X + 0 * SIZE], a7
|
||||||
|
cmp I, 0
|
||||||
|
LDF [X + 1 * SIZE], a8
|
||||||
|
|
||||||
|
ble,pt %icc, .LL52
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL51:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a1, t1
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
cmp I, 0
|
||||||
|
FMOV a2, t2
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
LDF [X + 0 * SIZE], a3
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
LDF [X + 1 * SIZE], a4
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
LDF [X + 0 * SIZE], a5
|
||||||
|
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
LDF [X + 1 * SIZE], a6
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
LDF [X + 0 * SIZE], a7
|
||||||
|
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
LDF [X + 1 * SIZE], a8
|
||||||
|
|
||||||
|
bg,pt %icc, .LL51
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL52:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a1, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a2, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a3, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a4, t4
|
||||||
|
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FMOV a5, t1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FMOV a6, t2
|
||||||
|
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FMOV a7, t3
|
||||||
|
FADD c2, t4, c2
|
||||||
|
FMOV a8, t4
|
||||||
|
|
||||||
|
.LL55:
|
||||||
|
and N, 3, I
|
||||||
|
cmp I, 0
|
||||||
|
ble,a,pn %icc, .LL59
|
||||||
|
nop
|
||||||
|
|
||||||
|
.LL56:
|
||||||
|
LDF [X + 0 * SIZE], a1
|
||||||
|
LDF [X + 1 * SIZE], a2
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
add I, -1, I
|
||||||
|
FMOV a1, t1
|
||||||
|
FMOV a2, t2
|
||||||
|
cmp I, 0
|
||||||
|
bg,pt %icc, .LL56
|
||||||
|
add X, INCX, X
|
||||||
|
|
||||||
|
.LL59:
|
||||||
|
FADD c1, t1, c1
|
||||||
|
FADD c2, t2, c2
|
||||||
|
FADD c1, t3, c1
|
||||||
|
FADD c2, t4, c2
|
||||||
|
|
||||||
|
FADD c1, c2, c1
|
||||||
|
|
||||||
|
return %i7 + 8
|
||||||
|
clr %o0
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = ../arm/zasum.c
|
ZASUMKERNEL = ../arm/zasum.c
|
||||||
|
|
||||||
|
SSUMKERNEL = ../arm/sum.c
|
||||||
|
DSUMKERNEL = ../arm/sum.c
|
||||||
|
CSUMKERNEL = ../arm/zsum.c
|
||||||
|
ZSUMKERNEL = ../arm/zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = ../arm/axpy.c
|
DAXPYKERNEL = ../arm/axpy.c
|
||||||
CAXPYKERNEL = ../arm/zaxpy.c
|
CAXPYKERNEL = ../arm/zaxpy.c
|
||||||
|
|
|
@ -0,0 +1,207 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define STACK 8
|
||||||
|
#define ARGS 0
|
||||||
|
|
||||||
|
#define STACK_M 4 + STACK + ARGS(%esp)
|
||||||
|
#define STACK_X 8 + STACK + ARGS(%esp)
|
||||||
|
#define STACK_INCX 12 + STACK + ARGS(%esp)
|
||||||
|
|
||||||
|
#define M %edx
|
||||||
|
#define X %ecx
|
||||||
|
#define INCX %esi
|
||||||
|
|
||||||
|
#define I %eax
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
/* Entry: save registers, fetch the arguments, reject empty input and    */
/* set up four zeroed accumulators on the x87 stack.                     */
PROLOGUE
|
||||||
|
|
||||||
|
/* These two pushes account for the 8 bytes of STACK in the argument     */
/* offsets; both registers are popped again at .L999.                    */
pushl %esi
|
||||||
|
pushl %ebx
|
||||||
|
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
|
||||||
|
EMMS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
movl STACK_M, M
|
||||||
|
movl STACK_X, X
|
||||||
|
movl STACK_INCX, INCX
|
||||||
|
|
||||||
|
/* With F_INTERFACE the M and INCX arguments are pointers; load the      */
/* values they point to.                                                 */
#ifdef F_INTERFACE
|
||||||
|
movl (M), M
|
||||||
|
movl (INCX), INCX
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Push 0.0 first so that the early exits below reach .L999 with a       */
/* single zero on the x87 stack.                                         */
fldz
|
||||||
|
testl M, M
|
||||||
|
jle .L999
|
||||||
|
testl INCX, INCX
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
/* Scale INCX so it can be added directly to the pointer X               */
/* (BASE_SHIFT is presumably log2(SIZE) -- defined outside this file).   */
sall $BASE_SHIFT, INCX
|
||||||
|
/* Three more zeros: four accumulators in total on the x87 stack.        */
fldz
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
/* Anything other than a stride of exactly one element goes to .L40.     */
cmpl $SIZE, INCX
|
||||||
|
jne .L40
|
||||||
|
|
||||||
|
movl M, I
|
||||||
|
sarl $3, I
|
||||||
|
jle .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Unit-stride main loop: eight elements per iteration, I iterations     */
/* (I was set to M >> 3 just before the loop).                           */
/* On entry the x87 stack holds the four accumulators.  After four loads */
/* the loaded values sit in st(0)..st(3) and the accumulators in         */
/* st(4)..st(7).  Every faddp adds st(0) into its target and pops, so    */
/* the targets 7, 5, 3, 1 each hit a different accumulator and the stack */
/* is back to four entries afterwards.                                   */
.L10:
|
||||||
|
#ifdef PREFETCH
|
||||||
|
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* First group of four elements. */
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
FLD 2 * SIZE(X)
|
||||||
|
FLD 3 * SIZE(X)
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
/* Second group of four elements. */
FLD 4 * SIZE(X)
|
||||||
|
FLD 5 * SIZE(X)
|
||||||
|
FLD 6 * SIZE(X)
|
||||||
|
FLD 7 * SIZE(X)
|
||||||
|
|
||||||
|
/* Advance X past the eight elements just loaded. */
addl $8 * SIZE, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decl I
|
||||||
|
jg .L10
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L20:
|
||||||
|
movl M, I
|
||||||
|
andl $7, I
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L21:
|
||||||
|
FLD (X)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
addl $1 * SIZE, X
|
||||||
|
decl I
|
||||||
|
jg .L21
|
||||||
|
jmp .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
movl M, I
|
||||||
|
sarl $3, I
|
||||||
|
jle .L60
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L50:
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decl I
|
||||||
|
jg .L50
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L60:
|
||||||
|
movl M, I
|
||||||
|
andl $7, I
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L61:
|
||||||
|
FLD (X)
|
||||||
|
addl INCX, X
|
||||||
|
faddp %st,%st(1)
|
||||||
|
decl I
|
||||||
|
jg .L61
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Final reduction.  The x87 stack holds the four accumulators here;     */
/* three add-and-pop steps leave their total as the only entry, st(0).   */
.L998:
|
||||||
|
faddp %st,%st(2)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Common return.  The early exits arrive here with a single 0.0 on the  */
/* x87 stack.  Restore the registers pushed on entry, in reverse order;  */
/* the value left in st(0) is presumably the function result.            */
.L999:
|
||||||
|
popl %ebx
|
||||||
|
popl %esi
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,208 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define STACK 8
|
||||||
|
#define ARGS 0
|
||||||
|
|
||||||
|
#define STACK_M 4 + STACK + ARGS(%esp)
|
||||||
|
#define STACK_X 8 + STACK + ARGS(%esp)
|
||||||
|
#define STACK_INCX 12 + STACK + ARGS(%esp)
|
||||||
|
|
||||||
|
#define M %edx
|
||||||
|
#define X %ecx
|
||||||
|
#define INCX %esi
|
||||||
|
|
||||||
|
#define I %eax
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
|
||||||
|
pushl %esi
|
||||||
|
pushl %ebx
|
||||||
|
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
|
||||||
|
EMMS
|
||||||
|
#endif
|
||||||
|
|
||||||
|
movl STACK_M, M
|
||||||
|
movl STACK_X, X
|
||||||
|
movl STACK_INCX, INCX
|
||||||
|
|
||||||
|
#ifdef F_INTERFACE
|
||||||
|
movl (M), M
|
||||||
|
movl (INCX), INCX
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fldz
|
||||||
|
testl M, M
|
||||||
|
jle .L999
|
||||||
|
testl INCX, INCX
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
sall $ZBASE_SHIFT, INCX
|
||||||
|
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
cmpl $SIZE * 2, INCX
|
||||||
|
jne .L40
|
||||||
|
|
||||||
|
movl M, I
|
||||||
|
sarl $2, I
|
||||||
|
jle .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L10:
|
||||||
|
#ifdef PREFETCH
|
||||||
|
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
FLD 2 * SIZE(X)
|
||||||
|
FLD 3 * SIZE(X)
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD 4 * SIZE(X)
|
||||||
|
FLD 5 * SIZE(X)
|
||||||
|
FLD 6 * SIZE(X)
|
||||||
|
FLD 7 * SIZE(X)
|
||||||
|
|
||||||
|
addl $8 * SIZE, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decl I
|
||||||
|
jg .L10
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L20:
|
||||||
|
movl M, I
|
||||||
|
andl $3, I
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L21:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
faddp %st,%st(3)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
addl $2 * SIZE, X
|
||||||
|
decl I
|
||||||
|
jg .L21
|
||||||
|
jmp .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
movl M, I
|
||||||
|
sarl $2, I
|
||||||
|
jle .L60
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L50:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addl INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addl INCX, X
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addl INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decl I
|
||||||
|
jg .L50
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L60:
|
||||||
|
movl M, I
|
||||||
|
andl $3, I
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L61:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addl INCX, X
|
||||||
|
faddp %st,%st(3)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
decl I
|
||||||
|
jg .L61
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L998:
|
||||||
|
faddp %st,%st(2)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
popl %ebx
|
||||||
|
popl %esi
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -94,6 +94,11 @@ DASUMKERNEL = ../arm/asum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = ../arm/zasum.c
|
ZASUMKERNEL = ../arm/zasum.c
|
||||||
|
|
||||||
|
SSUMKERNEL = ../arm/sum.c
|
||||||
|
DSUMKERNEL = ../arm/sum.c
|
||||||
|
CSUMKERNEL = ../arm/zsum.c
|
||||||
|
ZSUMKERNEL = ../arm/zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = ../arm/axpy.c
|
DAXPYKERNEL = ../arm/axpy.c
|
||||||
CAXPYKERNEL = ../arm/zaxpy.c
|
CAXPYKERNEL = ../arm/zaxpy.c
|
||||||
|
|
|
@ -0,0 +1,179 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define M ARG1
|
||||||
|
#define X ARG2
|
||||||
|
#define INCX ARG3
|
||||||
|
|
||||||
|
#define I %rax
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
fldz
|
||||||
|
testq M, M
|
||||||
|
jle .L999
|
||||||
|
testq INCX, INCX
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
salq $BASE_SHIFT, INCX
|
||||||
|
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
cmpq $SIZE, INCX
|
||||||
|
jne .L40
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L10:
|
||||||
|
#ifdef PREFETCH
|
||||||
|
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
FLD 2 * SIZE(X)
|
||||||
|
FLD 3 * SIZE(X)
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD 4 * SIZE(X)
|
||||||
|
FLD 5 * SIZE(X)
|
||||||
|
FLD 6 * SIZE(X)
|
||||||
|
FLD 7 * SIZE(X)
|
||||||
|
|
||||||
|
addq $8 * SIZE, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L10
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* Unit-stride remainder.  M is overwritten with M & 7, the number of    */
/* elements left after the eight-way loop; the result is in 0..7, so the */
/* jle is taken only when it is zero.                                    */
.L20:
|
||||||
|
andq $7, M
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
/* One element per iteration: load it, add it into the accumulator       */
/* directly below it on the x87 stack and pop, then step X by one        */
/* element.                                                              */
.L21:
|
||||||
|
FLD (X)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
addq $1 * SIZE, X
|
||||||
|
decq M
|
||||||
|
jg .L21
|
||||||
|
/* Skip the strided code and go straight to the final reduction. */
jmp .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
movq M, I
|
||||||
|
sarq $3, I
|
||||||
|
jle .L60
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L50:
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L50
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L60:
|
||||||
|
andq $7, M
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L61:
|
||||||
|
FLD (X)
|
||||||
|
addq INCX, X
|
||||||
|
faddp %st,%st(1)
|
||||||
|
decq M
|
||||||
|
jg .L61
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L998:
|
||||||
|
faddp %st,%st(2)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -0,0 +1,180 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/* The views and conclusions contained in the software and */
|
||||||
|
/* documentation are those of the authors and should not be */
|
||||||
|
/* interpreted as representing official policies, either expressed */
|
||||||
|
/* or implied, of The University of Texas at Austin. */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define M ARG1
|
||||||
|
#define X ARG2
|
||||||
|
#define INCX ARG3
|
||||||
|
|
||||||
|
#define I %rax
|
||||||
|
|
||||||
|
#include "l1param.h"
|
||||||
|
|
||||||
|
PROLOGUE
|
||||||
|
PROFCODE
|
||||||
|
|
||||||
|
fldz
|
||||||
|
testq M, M
|
||||||
|
jle .L999
|
||||||
|
testq INCX, INCX
|
||||||
|
jle .L999
|
||||||
|
|
||||||
|
salq $ZBASE_SHIFT, INCX
|
||||||
|
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
fldz
|
||||||
|
cmpq $SIZE * 2, INCX
|
||||||
|
jne .L40
|
||||||
|
|
||||||
|
movq M, I
|
||||||
|
sarq $2, I
|
||||||
|
jle .L20
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L10:
|
||||||
|
#ifdef PREFETCH
|
||||||
|
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
FLD 2 * SIZE(X)
|
||||||
|
FLD 3 * SIZE(X)
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD 4 * SIZE(X)
|
||||||
|
FLD 5 * SIZE(X)
|
||||||
|
FLD 6 * SIZE(X)
|
||||||
|
FLD 7 * SIZE(X)
|
||||||
|
|
||||||
|
addq $8 * SIZE, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L10
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L20:
|
||||||
|
andq $3, M
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
.L21:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
faddp %st,%st(3)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
addq $2 * SIZE, X
|
||||||
|
decq M
|
||||||
|
jg .L21
|
||||||
|
jmp .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L40:
|
||||||
|
movq M, I
|
||||||
|
sarq $2, I
|
||||||
|
jle .L60
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L50:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addq INCX, X
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
addq INCX, X
|
||||||
|
|
||||||
|
faddp %st, %st(7)
|
||||||
|
faddp %st, %st(5)
|
||||||
|
faddp %st, %st(3)
|
||||||
|
faddp %st, %st(1)
|
||||||
|
|
||||||
|
decq I
|
||||||
|
jg .L50
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L60:
|
||||||
|
andq $3, M
|
||||||
|
jle .L998
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
|
||||||
|
/* Strided remainder loop: M (already reduced to M & 3) iterations, one  */
/* element per iteration.  An element is two consecutive values,         */
/* presumably the real and imaginary parts.                              */
.L61:
|
||||||
|
FLD 0 * SIZE(X)
|
||||||
|
FLD 1 * SIZE(X)
|
||||||
|
/* INCX was converted to a byte step on entry. */
addq INCX, X
|
||||||
|
/* The second value goes into the accumulator that sits second from the  */
/* top; after the pop the first value goes into the topmost accumulator. */
faddp %st,%st(3)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
decq M
|
||||||
|
jg .L61
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L998:
|
||||||
|
faddp %st,%st(2)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
faddp %st,%st(1)
|
||||||
|
ALIGN_4
|
||||||
|
|
||||||
|
.L999:
|
||||||
|
ret
|
||||||
|
|
||||||
|
EPILOGUE
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = zasum.c
|
ZASUMKERNEL = zasum.c
|
||||||
|
|
||||||
|
# The ?SUM kernels must not be built from the ?ASUM sources, which
# presumably accumulate absolute values rather than signed values.
# The split below mirrors the ASUM lines above it: generic C sources for the
# S and C variants, directory-local sources for the D and Z variants.
# NOTE(review): assumes dsum.c and zsum.c live next to dasum.c/zasum.c and
# are usable on this target - confirm against the file this hunk belongs to.
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = dsum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
CAXPYKERNEL = ../arm/zaxpy.c
|
CAXPYKERNEL = ../arm/zaxpy.c
|
||||||
|
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = dasum.c
|
||||||
CASUMKERNEL = casum.c
|
CASUMKERNEL = casum.c
|
||||||
ZASUMKERNEL = zasum.c
|
ZASUMKERNEL = zasum.c
|
||||||
|
|
||||||
|
# Source files for the four ?SUM kernels (S, D, C and Z variants). All four
# name a file without a directory prefix, like the ASUM and AXPY entries
# surrounding them.
SSUMKERNEL = ssum.c
|
||||||
|
DSUMKERNEL = dsum.c
|
||||||
|
CSUMKERNEL = csum.c
|
||||||
|
ZSUMKERNEL = zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = saxpy.c
|
SAXPYKERNEL = saxpy.c
|
||||||
DAXPYKERNEL = daxpy.c
|
DAXPYKERNEL = daxpy.c
|
||||||
CAXPYKERNEL = caxpy.c
|
CAXPYKERNEL = caxpy.c
|
||||||
|
|
|
@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c
|
||||||
CASUMKERNEL = ../arm/zasum.c
|
CASUMKERNEL = ../arm/zasum.c
|
||||||
ZASUMKERNEL = ../arm/zasum.c
|
ZASUMKERNEL = ../arm/zasum.c
|
||||||
|
|
||||||
|
# Source files for the four ?SUM kernels. The S and D variants share
# ../arm/sum.c and the C and Z variants share ../arm/zsum.c, following the
# same ../arm/ pattern as the ASUM and AXPY entries surrounding them.
SSUMKERNEL = ../arm/sum.c
|
||||||
|
DSUMKERNEL = ../arm/sum.c
|
||||||
|
CSUMKERNEL = ../arm/zsum.c
|
||||||
|
ZSUMKERNEL = ../arm/zsum.c
|
||||||
|
|
||||||
SAXPYKERNEL = ../arm/axpy.c
|
SAXPYKERNEL = ../arm/axpy.c
|
||||||
DAXPYKERNEL = ../arm/axpy.c
|
DAXPYKERNEL = ../arm/axpy.c
|
||||||
CAXPYKERNEL = ../arm/zaxpy.c
|
CAXPYKERNEL = ../arm/zaxpy.c
|
||||||
|
|
|
@ -0,0 +1,137 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/*
 * Vectorized plain (signed) sum over a contiguous run of complex elements.
 *
 * n  number of complex elements to add. It is shifted right by 5 to obtain
 *    the loop count and each loop iteration consumes 256 bytes, so n must be
 *    a positive multiple of 32; the loop is count-controlled (brctg) and its
 *    body runs before the count is tested.
 * x  first element; 2 * n FLOAT values are declared as read by the asm.
 *
 * Returns the sum of all real and imaginary parts.
 * NOTE(review): the 4-byte-element instructions (vfasb, vstef) assume FLOAT
 * is single precision in this file - confirm against the build flags.
 */
static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight partial-sum vectors into v24. */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add of the four lanes of v24 into lane 0. */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||||
|
|
||||||
|
/*
 * Sum of the real and imaginary parts of n complex elements of x, visiting
 * every inc_x-th element.
 *
 * Returns 0 when n or inc_x is not positive. For unit stride the largest
 * leading multiple of 32 elements is handed to csum_kernel_32 and the rest
 * is added one element at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* complex elements consumed so far */
  BLASLONG pos = 0;  /* index in x of the current real part */
  BLASLONG stride;   /* distance in FLOATs between consecutive elements */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x == 1) {
    BLASLONG bulk = n & -32;

    if (bulk > 0) {
      total = csum_kernel_32(bulk, x);
      done = bulk;
      pos = 2 * bulk;
    }
    stride = 2;
  } else {
    stride = 2 * inc_x;
  }

  /* Scalar loop: the whole vector when strided, the tail otherwise. */
  for (; done < n; done++, pos += stride)
    total += x[pos] + x[pos + 1];

  return total;
}
|
|
@ -0,0 +1,148 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
/*
 * Vectorized plain (signed) sum over a contiguous run of real elements.
 *
 * n  number of elements to add. It is shifted right by 5 to obtain the loop
 *    count and each loop iteration consumes 256 bytes, so n must be a
 *    positive multiple of 32; the loop is count-controlled (brctg) and its
 *    body runs before the count is tested.
 * x  first element; n FLOAT values are declared as read by the asm.
 *
 * Returns the sum of the n values.
 * NOTE(review): the 8-byte-element instructions (vfadb, vsteg) assume FLOAT
 * is double precision in this file - confirm against the build flags.
 */
static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],5\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight partial-sum vectors into v24. */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add of the two lanes of v24 into lane 0. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||||
|
|
||||||
|
/*
 * Sum of n elements of x, visiting every inc_x-th element.
 *
 * Returns 0 when n or inc_x is not positive. For unit stride the largest
 * leading multiple of 32 elements is handed to dsum_kernel_32 and the rest
 * is added one at a time. For other strides the elements are accumulated
 * four per iteration into two alternating partial sums, then the remainder
 * is added to their total.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0; /* index in x of the next element to add */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x != 1) {
    BLASLONG unrolled = n & -4;
    BLASLONG done = 0; /* elements consumed so far */
    FLOAT lane_a = 0.0;
    FLOAT lane_b = 0.0;

    for (; done < unrolled; done += 4, pos += inc_x * 4) {
      lane_a += x[pos];
      lane_b += x[pos + inc_x];
      lane_a += x[pos + 2 * inc_x];
      lane_b += x[pos + 3 * inc_x];
    }

    total = lane_a + lane_b;

    for (; done < n; done++, pos += inc_x)
      total += x[pos];
  } else {
    BLASLONG bulk = n & -32;

    if (bulk > 0) {
      total = dsum_kernel_32(bulk, x);
      pos = bulk;
    }

    for (; pos < n; pos++)
      total += x[pos];
  }

  return total;
}
|
|
@ -0,0 +1,151 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Vectorized plain (signed) sum over a contiguous run of real elements.
 *
 * n  number of elements to add. It is shifted right by 6 to obtain the loop
 *    count and each loop iteration consumes 256 bytes, so n must be a
 *    positive multiple of 64; the loop is count-controlled (brctg) and its
 *    body runs before the count is tested.
 * x  first element; n FLOAT values are declared as read by the asm.
 *
 * Returns the sum of the n values.
 * NOTE(review): the 4-byte-element instructions (vfasb, vstef) assume FLOAT
 * is single precision in this file - confirm against the build flags.
 */
static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],6\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfasb %%v24,%%v24,%%v16\n\t"
          "vfasb %%v25,%%v25,%%v17\n\t"
          "vfasb %%v26,%%v26,%%v18\n\t"
          "vfasb %%v27,%%v27,%%v19\n\t"
          "vfasb %%v28,%%v28,%%v20\n\t"
          "vfasb %%v29,%%v29,%%v21\n\t"
          "vfasb %%v30,%%v30,%%v22\n\t"
          "vfasb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight partial-sum vectors into v24. */
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vfasb %%v24,%%v24,%%v26\n\t"
          "vfasb %%v24,%%v24,%%v27\n\t"
          "vfasb %%v24,%%v24,%%v28\n\t"
          "vfasb %%v24,%%v24,%%v29\n\t"
          "vfasb %%v24,%%v24,%%v30\n\t"
          "vfasb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add of the four lanes of v24 into lane 0. */
          "veslg %%v25,%%v24,32\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          "vrepf %%v25,%%v24,2\n\t"
          "vfasb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vstef %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||||
|
|
||||||
|
/*
 * Sum of n elements of x, visiting every inc_x-th element.
 *
 * Returns 0 when n or inc_x is not positive. For unit stride the largest
 * leading multiple of 64 elements is handed to ssum_kernel_64 and the rest
 * is added one at a time. For other strides the elements are accumulated
 * four per iteration into two alternating partial sums, then the remainder
 * is added to their total.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG pos = 0; /* index in x of the next element to add */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x != 1) {
    BLASLONG unrolled = n & -4;
    BLASLONG done = 0; /* elements consumed so far */
    FLOAT lane_a = 0.0;
    FLOAT lane_b = 0.0;

    for (; done < unrolled; done += 4, pos += inc_x * 4) {
      lane_a += x[pos];
      lane_b += x[pos + inc_x];
      lane_a += x[pos + 2 * inc_x];
      lane_b += x[pos + 3 * inc_x];
    }

    total = lane_a + lane_b;

    for (; done < n; done++, pos += inc_x)
      total += x[pos];
  } else {
    BLASLONG bulk = n & -64;

    if (bulk > 0) {
      total = ssum_kernel_64(bulk, x);
      pos = bulk;
    }

    for (; pos < n; pos++)
      total += x[pos];
  }

  return total;
}
|
|
@ -0,0 +1,136 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Vectorized plain (signed) sum over a contiguous run of complex elements.
 *
 * n  number of complex elements to add. It is shifted right by 4 to obtain
 *    the loop count and each loop iteration consumes 256 bytes, so n must be
 *    a positive multiple of 16; the loop is count-controlled (brctg) and its
 *    body runs before the count is tested.
 * x  first element; 2 * n FLOAT values are declared as read by the asm.
 *
 * Returns the sum of all real and imaginary parts.
 * NOTE(review): the 8-byte-element instructions (vfadb, vsteg) assume FLOAT
 * is double precision in this file - confirm against the build flags.
 */
static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
  FLOAT sum;

  __asm__("vzero %%v24\n\t"
          "vzero %%v25\n\t"
          "vzero %%v26\n\t"
          "vzero %%v27\n\t"
          "vzero %%v28\n\t"
          "vzero %%v29\n\t"
          "vzero %%v30\n\t"
          "vzero %%v31\n\t"
          "srlg %[n],%[n],4\n\t"
          "xgr %%r1,%%r1\n\t"
          "0:\n\t"
          "pfd 1, 1024(%%r1,%[x])\n\t"
          "vl %%v16, 0(%%r1,%[x])\n\t"
          "vl %%v17, 16(%%r1,%[x])\n\t"
          "vl %%v18, 32(%%r1,%[x])\n\t"
          "vl %%v19, 48(%%r1,%[x])\n\t"
          "vl %%v20, 64(%%r1,%[x])\n\t"
          "vl %%v21, 80(%%r1,%[x])\n\t"
          "vl %%v22, 96(%%r1,%[x])\n\t"
          "vl %%v23, 112(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "vl %%v16, 128(%%r1,%[x])\n\t"
          "vl %%v17, 144(%%r1,%[x])\n\t"
          "vl %%v18, 160(%%r1,%[x])\n\t"
          "vl %%v19, 176(%%r1,%[x])\n\t"
          "vl %%v20, 192(%%r1,%[x])\n\t"
          "vl %%v21, 208(%%r1,%[x])\n\t"
          "vl %%v22, 224(%%r1,%[x])\n\t"
          "vl %%v23, 240(%%r1,%[x])\n\t"
          "vfadb %%v24,%%v24,%%v16\n\t"
          "vfadb %%v25,%%v25,%%v17\n\t"
          "vfadb %%v26,%%v26,%%v18\n\t"
          "vfadb %%v27,%%v27,%%v19\n\t"
          "vfadb %%v28,%%v28,%%v20\n\t"
          "vfadb %%v29,%%v29,%%v21\n\t"
          "vfadb %%v30,%%v30,%%v22\n\t"
          "vfadb %%v31,%%v31,%%v23\n\t"
          "agfi %%r1,256\n\t"
          "brctg %[n],0b\n\t"
          /* Fold the eight partial-sum vectors into v24. */
          "vfadb %%v24,%%v24,%%v25\n\t"
          "vfadb %%v24,%%v24,%%v26\n\t"
          "vfadb %%v24,%%v24,%%v27\n\t"
          "vfadb %%v24,%%v24,%%v28\n\t"
          "vfadb %%v24,%%v24,%%v29\n\t"
          "vfadb %%v24,%%v24,%%v30\n\t"
          "vfadb %%v24,%%v24,%%v31\n\t"
          /* Horizontal add of the two lanes of v24 into lane 0. */
          "vrepg %%v25,%%v24,1\n\t"
          "vfadb %%v24,%%v24,%%v25\n\t"
          /* The operand name must match the [sum] output declared below. */
          "vsteg %%v24,%[sum],0"
          : [sum] "=Q"(sum),[n] "+&r"(n)
          : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
          : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
            "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");

  return sum;
}
|
||||||
|
|
||||||
|
/*
 * Sum of the real and imaginary parts of n complex elements of x, visiting
 * every inc_x-th element.
 *
 * Returns 0 when n or inc_x is not positive. For unit stride the largest
 * leading multiple of 16 elements is handed to zsum_kernel_16 and the rest
 * is added one element at a time.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
  FLOAT total = 0.0;
  BLASLONG done = 0; /* complex elements consumed so far */
  BLASLONG pos = 0;  /* index in x of the current real part */
  BLASLONG stride;   /* distance in FLOATs between consecutive elements */

  if (!(n > 0 && inc_x > 0))
    return total;

  if (inc_x == 1) {
    BLASLONG bulk = n & -16;

    if (bulk > 0) {
      total = zsum_kernel_16(bulk, x);
      done = bulk;
      pos = 2 * bulk;
    }
    stride = 2;
  } else {
    stride = 2 * inc_x;
  }

  /* Scalar loop: the whole vector when strided, the tail otherwise. */
  for (; done < n; done++, pos += stride)
    total += x[pos] + x[pos + 1];

  return total;
}
|
Loading…
Reference in New Issue