diff --git a/Makefile.arm b/Makefile.arm new file mode 100644 index 000000000..8502d5286 --- /dev/null +++ b/Makefile.arm @@ -0,0 +1,12 @@ + +ifeq ($(CORE), ARMV7) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif + +ifeq ($(CORE), ARMV6) +CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +endif + + diff --git a/Makefile.arm64 b/Makefile.arm64 new file mode 100644 index 000000000..a4f8bab6b --- /dev/null +++ b/Makefile.arm64 @@ -0,0 +1,7 @@ + +ifeq ($(CORE), ARMV8) +CCOMMON_OPT += -march=armv8-a +FCOMMON_OPT += -march=armv8-a +endif + + diff --git a/cblas_noconst.h b/cblas_noconst.h new file mode 100644 index 000000000..fd2e940c0 --- /dev/null +++ b/cblas_noconst.h @@ -0,0 +1,303 @@ +#ifndef CBLAS_H +#define CBLAS_H + +#include <stddef.h> +#include "common.h" + +#ifdef __cplusplus +extern "C" { + /* Assume C declarations for C++ */ +#endif /* __cplusplus */ + +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + +/* Get the parallelization type which is used by OpenBLAS */ +int openblas_get_parallel(void); +/* OpenBLAS is compiled for sequential use */ +#define OPENBLAS_SEQUENTIAL 0 +/* OpenBLAS is compiled using normal threading model */ +#define OPENBLAS_THREAD 1 +/* OpenBLAS is compiled using OpenMP threading model */ +#define OPENBLAS_OPENMP 2 + + +#define CBLAS_INDEX size_t + +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, 
CblasRight=142} CBLAS_SIDE; + +float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); +double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); +float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); +double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); + +openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); + +float cblas_sasum (blasint n, float *x, blasint incx); +double cblas_dasum (blasint n, double *x, blasint incx); +float cblas_scasum(blasint n, float *x, blasint incx); +double cblas_dzasum(blasint n, double *x, blasint incx); + +float cblas_snrm2 (blasint N, float *X, blasint incX); +double cblas_dnrm2 (blasint N, double *X, blasint incX); +float cblas_scnrm2(blasint N, float *X, blasint incX); +double cblas_dznrm2(blasint N, double *X, blasint incX); + +CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, 
blasint incy); +void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); +void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); +void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); 
+ +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, 
double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void 
cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER 
order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum 
CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum 
CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void 
cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, 
blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double 
*A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void cblas_xerbla(blasint p, char *rout, char *form, ...); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif diff --git a/common_arm.h b/common_arm.h new file mode 100644 index 000000000..8c9752d9f --- /dev/null +++ b/common_arm.h @@ -0,0 +1,169 @@ 
+/***************************************************************************** +Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_ARM +#define COMMON_ARM + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ + + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" /* NOTE(review): r2 is zeroed here, so the strex below stores 0 — the lock word is never set non-zero; verify intended acquire semantics */ + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); + +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0 ); + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_arm64.h b/common_arm64.h new file mode 100644 index 000000000..2da0d894c --- /dev/null +++ b/common_arm64.h @@ -0,0 +1,169 @@ +/***************************************************************************** +Copyright (c) 2011, Lab of Parallel 
Software and Computational Science,ICSAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_ARM64 +#define COMMON_ARM64 + +#define MB +#define WMB + +#define INLINE inline + +#define RETURN_BY_COMPLEX + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile BLASULONG *address){ +/* + int register ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldrex r2, [%1] \n\t" + "mov r2, #0 \n\t" + "strex r3, r2, [%1] \n\t" + "mov %0 , r3 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "r2" , "r3" + + + ); + + } while (ret); +*/ +} + + +static inline unsigned long long rpcc(void){ + unsigned long long ret=0; + double v; + struct timeval tv; + gettimeofday(&tv,NULL); + v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; + ret = (unsigned long long) ( v * 1000.0d ); + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#if defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .arm ;\ + .global REALNAME ;\ + .func REALNAME ;\ +REALNAME: + +#define EPILOGUE + +#define PROFCODE + +#endif + + +#define SEEK_ADDRESS + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/cpuid_arm.c b/cpuid_arm.c new file mode 100644 index 000000000..efd1369b4 --- /dev/null +++ b/cpuid_arm.c @@ -0,0 +1,262 @@ +/************************************************************************** + Copyright (c) 2013, The OpenBLAS Project 
+ All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/
+
+#include <stdio.h>
+
+#define CPU_UNKNOWN 0
+#define CPU_ARMV6 1
+#define CPU_ARMV7 2
+#define CPU_CORTEXA15 3
+
+static char *cpuname[] = {
+ "UNKOWN",
+ "ARMV6",
+ "ARMV7",
+ "CORTEXA15"
+};
+
+
+int get_feature(char *search)
+{
+
+#ifdef linux
+ FILE *infile;
+ char buffer[2048], *p,*t;
+ p = (char *) NULL ;
+
+ infile = fopen("/proc/cpuinfo", "r");
+
+ while (fgets(buffer, sizeof(buffer), infile))
+ {
+
+ if (!strncmp("Features", buffer, 8))
+ {
+ p = strchr(buffer, ':') + 2;
+ break;
+ }
+ }
+
+ fclose(infile);
+
+
+ if( p == NULL ) return(0);
+
+ t = strtok(p," ");
+ for( ; t != NULL; t = strtok(NULL," ") )
+ {
+ if (!strcmp(t, search)) { return(1); }
+ }
+
+#endif
+ return(0);
+}
+
+
+int detect(void)
+{
+
+#ifdef linux
+
+ FILE *infile;
+ char buffer[512], *p;
+ p = (char *) NULL ;
+
+ infile = fopen("/proc/cpuinfo", "r");
+
+ while (fgets(buffer, sizeof(buffer), infile))
+ {
+
+ if (!strncmp("model name", buffer, 10))
+ {
+ p = strchr(buffer, ':') + 2;
+ break;
+ }
+ }
+
+ fclose(infile);
+
+ if(p != NULL)
+ {
+
+ if (strstr(p, "ARMv7"))
+ {
+ if ( get_feature("vfpv4"))
+ return CPU_ARMV7;
+
+ if ( get_feature("vfpv3"))
+ return CPU_ARMV7;
+
+ if ( get_feature("vfp"))
+ return CPU_ARMV6;
+
+
+ }
+
+ if (strstr(p, "ARMv6"))
+ {
+ if ( get_feature("vfp"))
+ return CPU_ARMV6;
+ }
+
+
+ }
+#endif
+
+ return CPU_UNKNOWN;
+}
+
+char *get_corename(void)
+{
+ return cpuname[detect()];
+}
+
+void get_architecture(void)
+{
+ printf("ARM");
+}
+
+void get_subarchitecture(void)
+{
+ int d = detect();
+ switch (d)
+ {
+
+ case CPU_ARMV7:
+ printf("ARMV7");
+ break;
+
+ case CPU_ARMV6:
+ printf("ARMV6");
+ break;
+
+ default:
+ printf("UNKNOWN");
+ break;
+ }
+}
+
+void get_subdirname(void)
+{
+ printf("arm");
+}
+
+void get_cpuconfig(void)
+{
+
+ int d = detect();
+ switch (d)
+ {
+
+ case CPU_ARMV7:
+ printf("#define ARMV7\n");
+ printf("#define HAVE_VFP\n");
+ printf("#define HAVE_VFPV3\n");
+ if ( 
get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + case CPU_ARMV6: + printf("#define ARMV6\n"); + printf("#define HAVE_VFP\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + } +} + + +void get_libname(void) +{ + + int d = detect(); + switch (d) + { + + case CPU_ARMV7: + printf("armv7\n"); + break; + + case CPU_ARMV6: + printf("armv6\n"); + break; + + } +} + + +void get_features(void) +{ + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + + if( p == NULL ) return; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; } + if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; } + if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; } + if (!strcmp(t, "neon")) { printf("HAVE_NEON=1\n"); continue; } + } + +#endif + return; +} + + diff --git a/ctest.c b/ctest.c index 413519274..86dc226d4 100644 --- a/ctest.c +++ b/ctest.c @@ -124,3 +124,12 @@ ARCH_IA64 #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) BINARY_64 #endif + +#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__) 
+ARCH_ARM +#endif + +#if defined(__aarch64__) +ARCH_ARM64 +#endif + diff --git a/kernel/arm/KERNEL b/kernel/arm/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 new file mode 100644 index 000000000..f47a843f3 --- /dev/null +++ b/kernel/arm/KERNEL.ARMV6 @@ -0,0 +1,142 @@ +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S + +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S + +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S + +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S + +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S + +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S + +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S + +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S + +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + 
+SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S + +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S + +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S + +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = 
zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 new file mode 100644 index 000000000..507f9813c --- /dev/null +++ b/kernel/arm/KERNEL.ARMV7 @@ -0,0 +1,141 @@ +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S + +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S + +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S + +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S + +ISAMAXKERNEL = iamax_vfp.S +IDAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S + +ISAMINKERNEL = iamax_vfp.S +IDAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S + +ISMAXKERNEL = iamax_vfp.S +IDMAXKERNEL = iamax_vfp.S + +ISMINKERNEL = iamax_vfp.S +IDMINKERNEL = iamax_vfp.S + +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + +SCOPYKERNEL = scopy_vfp.S +DCOPYKERNEL = dcopy_vfp.S +CCOPYKERNEL = 
ccopy_vfp.S +ZCOPYKERNEL = zcopy_vfp.S + +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + +SSCALKERNEL = scal_vfp.S +DSCALKERNEL = scal_vfp.S +CSCALKERNEL = scal_vfp.S +ZSCALKERNEL = scal_vfp.S + +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S + +#SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT 
= ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/kernel/arm/Makefile b/kernel/arm/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c new file mode 100644 index 000000000..55107ca4f --- /dev/null +++ b/kernel/arm/amax.c @@ -0,0 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n < 0 || inc_x < 1 ) return(maxf); + + maxf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) > ABS(maxf) ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c new file mode 100644 index 000000000..3f7e97be6 --- /dev/null +++ b/kernel/arm/amin.c @@ -0,0 +1,73 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n < 0 || inc_x < 1 ) return(minf); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c new file mode 100644 index 000000000..5ac6936a0 --- /dev/null +++ b/kernel/arm/asum.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n < 0 || inc_x < 1 ) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S new file mode 100644 index 000000000..2b6ceb191 --- /dev/null +++ b/kernel/arm/asum_vfp.S @@ -0,0 +1,481 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, 
X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + add X, X, INC_X + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + fldmiad X!, { d6 - d7 } + vabs.f64 d6, d6 + vadd.f64 d1 , d1, d5 + vabs.f64 d7, d7 + vadd.f64 d0 , d0, d6 + vadd.f64 d1 , d1, d7 + + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + fldmiad X!, { d4 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + + +.endm + + +.macro KERNEL_S4 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 -d5 } + vabs.f64 d4, d4 + vadd.f64 d0 , d0, d4 + vabs.f64 d5, d5 + vadd.f64 d0 , d0, d5 + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + 
vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + fldmias X!, { s4 - s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + fldmias X!, { s6 - s7 } + vabs.f32 s6, s6 + vadd.f32 s1 , s1, s5 + vabs.f32 s7, s7 + vadd.f32 s0 , s0, s6 + vadd.f32 s1 , s1, s7 + + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + + fldmias X!, { s4 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + +.endm + + +.macro KERNEL_S4 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 -s5 } + vabs.f32 s4, s4 + vadd.f32 s0 , s0, s4 + vabs.f32 s5, s5 + vadd.f32 s0 , s0, s5 + add X, X, INC_X + +.endm + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 +#else + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 +#endif + + cmp N, #0 + ble asum_kernel_L999 + + cmp INC_X, #0 + beq asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + + +asum_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_F1 + + .align 5 + +asum_kernel_F4: + +#if !defined(DOUBLE) && !defined(COMPLEX) + pld [ X, #X_PRE ] +#endif + KERNEL_F4 + + subs I, I, #1 + ble asum_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: 
+ + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + + b asum_kernel_L999 + +asum_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + asrs I, N, #2 // I = N / 4 + ble asum_kernel_S1 + + .align 5 + +asum_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + +asum_kernel_L999: + + +#if defined(DOUBLE) + vadd.f64 d0 , d0, d1 // set return value +#else + vadd.f32 s0 , s0, s1 // set return value +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/axpy.c b/kernel/arm/axpy.c new file mode 100644 index 000000000..dceddf78a --- /dev/null +++ b/kernel/arm/axpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S new file mode 100644 index 000000000..acc575707 --- /dev/null +++ b/kernel/arm/axpy_vfp.S @@ -0,0 +1,503 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : xOK +* CTEST : xOK +* TEST : xOK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + +#if !defined(CONJ) + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fnmacd +#define FMAC_I1 fmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fnmacs +#define FMAC_I1 fmacs +#define FMAC_I2 fmacs + +#endif + +#else // CONJ + +#if defined(DOUBLE) + +#define FMAC_R1 fmacd +#define FMAC_R2 fmacd +#define FMAC_I1 fnmacd +#define FMAC_I2 fmacd + +#else + +#define FMAC_R1 fmacs +#define FMAC_R2 fmacs +#define FMAC_I1 fnmacs +#define FMAC_I2 fmacs + +#endif + +#endif + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + fmacd d9 , d0, d5 + fstmiad Y!, { d9 } + fmacd d10, d0, d6 + fstmiad Y!, { d10 } + fmacd d11, d0, d7 + fstmiad Y!, { d11 } + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y!, { d8 } + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 } + fldmiad Y , { d8 } + fmacd d8 , d0, d4 + fstmiad Y , { d8 } + add X, X, INC_X + add Y, 
Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + fmacs s9 , s0, s5 + fstmias Y!, { s9 } + fmacs s10, s0, s6 + fstmias Y!, { s10 } + fmacs s11, s0, s7 + fstmias Y!, { s11 } + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y!, { s8 } + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 } + fldmias Y , { s8 } + fmacs s8 , s0, s4 + fstmias Y , { s8 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + pld [ X, #X_PRE ] + fldmiad X!, { d4 - d7 } + pld [ Y, #X_PRE ] + fldmiad Y , { d8 - d11 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + FMAC_R1 d10, d0, d6 + FMAC_R2 d10, d1, d7 + FMAC_I1 d11, d0, d7 + FMAC_I2 d11, d1, d6 + fstmiad Y!, { d10 } + fstmiad Y!, { d11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y!, { d8 } + fstmiad Y!, { d9 } + + + +.endm + +.macro KERNEL_S1 + + fldmiad X , { d4 - d5 } + fldmiad Y , { d8 - d9 } + + FMAC_R1 d8 , d0, d4 + FMAC_R2 d8 , d1, d5 + FMAC_I1 d9 , d0, d5 + FMAC_I2 d9 , d1, d4 + fstmiad Y , { d8 - d9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s4 - s7 } + pld [ Y, #X_PRE ] + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, 
s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + fldmias X!, { s4 - s7 } + fldmias Y , { s8 - s11 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + FMAC_R1 s10, s0, s6 + FMAC_R2 s10, s1, s7 + FMAC_I1 s11, s0, s7 + FMAC_I2 s11, s1, s6 + fstmias Y!, { s10 } + fstmias Y!, { s11 } + + + + + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y!, { s8 } + fstmias Y!, { s9 } + + + +.endm + +.macro KERNEL_S1 + + fldmias X , { s4 - s5 } + fldmias Y , { s8 - s9 } + + FMAC_R1 s8 , s0, s4 + FMAC_R2 s8 , s1, s5 + FMAC_I1 s9 , s0, s5 + FMAC_I2 s9 , s1, s4 + fstmias Y , { s8 - s9 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + sub sp, sp, #STACKSIZE // reserve stack + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + sub r12, fp, #128 + +#if defined(DOUBLE) + vstm r12, { d8 - d15} // store floating point registers +#else + vstm r12, { s8 - s15} // store floating point registers +#endif + + cmp N, #0 + ble axpy_kernel_L999 + + cmp INC_X, #0 + beq axpy_kernel_L999 + + cmp INC_Y, #0 + beq axpy_kernel_L999 + + cmp INC_X, #1 + bne axpy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne axpy_kernel_S_BEGIN + + +axpy_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_F1 + + .align 5 + +axpy_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + 
subs I, I, #1 + ble axpy_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne axpy_kernel_F4 + +axpy_kernel_F1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne axpy_kernel_F10 + + b axpy_kernel_L999 + +axpy_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble axpy_kernel_S1 + + .align 5 + +axpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S4 + +axpy_kernel_S1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S10 + + +axpy_kernel_L999: + + sub r3, fp, #128 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ccopy_vfp.S b/kernel/arm/ccopy_vfp.S new file mode 100644 index 000000000..aaba7825e --- /dev/null +++ b/kernel/arm/ccopy_vfp.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s7 } + fstmias Y!, { s0 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 - s1 } + fstmias Y!, { s0 - s1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s2 - s3 } + fstmias Y, { s2 - s3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 - s1 } + fstmias Y, { s0 - s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ 
+ + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble ccopy_kernel_L999 + + cmp INC_X, #0 + beq ccopy_kernel_L999 + + cmp INC_Y, #0 + beq ccopy_kernel_L999 + + cmp INC_X, #1 + bne ccopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne ccopy_kernel_S_BEGIN + +ccopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_F1 + +ccopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne ccopy_kernel_F4 + +ccopy_kernel_F1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne ccopy_kernel_F10 + + b ccopy_kernel_L999 + +ccopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble ccopy_kernel_S1 + +ccopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne ccopy_kernel_S4 + +ccopy_kernel_S1: + + ands I, N, #3 + ble ccopy_kernel_L999 + +ccopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne ccopy_kernel_S10 + + + + + + +ccopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S new file mode 100644 index 000000000..b653888df --- /dev/null +++ b/kernel/arm/cdot_vfp.S @@ -0,0 +1,284 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fldmias X!, { s6 - s7 } + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + + fldmias Y!, { s10 - s11 } + fmacs s0 , s6, s10 + fmacs s1 , s6, s11 + fmacs s2 , s7, s11 + fmacs s3 , s7, s10 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + fldmias Y!, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + +.endm + + +/*************************************************************************************************************************/ + +.macro KERNEL_S4 + + nop + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 
, s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s8 - s9 } + fmacs s0 , s4, s8 + fmacs s1 , s4, s9 + fmacs s2 , s5, s9 + fmacs s3 , s5, s8 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + vsub.f32 s2 , s2 , s2 + vsub.f32 s3 , s3 , s3 + + cmp N, #0 + ble cdot_kernel_L999 + + cmp INC_X, #0 + beq cdot_kernel_L999 + + cmp INC_Y, #0 + beq cdot_kernel_L999 + + cmp INC_X, #1 + bne cdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne cdot_kernel_S_BEGIN + +cdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble cdot_kernel_F1 + +cdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne cdot_kernel_F4 + +cdot_kernel_F1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne cdot_kernel_F10 + + b cdot_kernel_L999 + +cdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble 
cdot_kernel_S1 + +cdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne cdot_kernel_S4 + +cdot_kernel_S1: + + ands I, N, #3 + ble cdot_kernel_L999 + +cdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne cdot_kernel_S10 + + + +cdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if !defined(CONJ) + vsub.f32 s0 , s0, s2 + vadd.f32 s1 , s1, s3 +#else + vadd.f32 s0 , s0, s2 + vsub.f32 s1 , s1, s3 +#endif + + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..75fbf097b --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -0,0 +1,1252 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R 
fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , 
s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + fldmias CO2, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 
+ fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + fldmias CO2, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO 
] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R 
s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + 
KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + 
+/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + +cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I 
= I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..3aba68de8 --- /dev/null +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1309 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/01 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* CGEMM_P 96 +* CGEMM_Q 120 +* CGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 2.59 GFLOPS ATLAS: 2.37 GFLOPS +* 2 Cores: 5.17 GFLOPS ATLAS: 4.46 GFLOPS +* 3 Cores: 7.69 GFLOPS ATLAS: 6.50 GFLOPS +* 4 Cores: 10.22 GFLOPS ATLAS: 8.18 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define 
ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fnmacs + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - 
s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + 
fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + fldmias CO2, { s8 - s11 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs 
s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + fldmias CO2, { s8 - s9 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + 
+/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro 
KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s7 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + 
add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias CO1, { s4 - s5 } + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble cgemm_kernel_L1_BEGIN + +cgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +cgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + 
KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt cgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_30: + tst L, #3 + ble cgemm_kernel_L2_M2_40 + + tst L, #2 + ble cgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + +cgemm_kernel_L2_M2_32: + + tst L, #1 + ble cgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b cgemm_kernel_L2_M2_44 + + +cgemm_kernel_L2_M2_40: + + INIT2x2 + + +cgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne cgemm_kernel_L2_M2_46 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L2_M2_20 + + +cgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + 
SAVE1x2 + + +cgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt cgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble cgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +cgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt cgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt cgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_30: + tst L, #3 + ble cgemm_kernel_L1_M2_40 + + tst L, #2 + ble cgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + +cgemm_kernel_L1_M2_32: + + tst L, #1 + ble cgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b cgemm_kernel_L1_M2_44 + + +cgemm_kernel_L1_M2_40: + + INIT2x1 + + +cgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, 
#1 + bne cgemm_kernel_L1_M2_46 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne cgemm_kernel_L1_M2_20 + + +cgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + + +cgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_ncopy_2_vfp.S b/kernel/arm/cgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..08fbd5501 --- /dev/null +++ b/kernel/arm/cgemm_ncopy_2_vfp.S @@ -0,0 +1,258 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY2x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s4 , [ AO1, #8 ] + flds s5 , [ AO1, #12 ] + + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + add AO1, AO1, #16 + flds s6 , [ AO2, #8 ] + flds s7 , [ AO2, #12 ] + + fstmias BO!, { s0 - s7 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO2, #0 ] + flds s3 , [ AO2, #4 ] + + add AO1, AO1, #8 + fstmias BO!, { s0 - s3 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + fstmias BO!, { s0 - s3 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + flds s0 , [ AO1, #0 ] + flds s1 , [ AO1, #4 ] + + fstmias BO!, { s0 - s1 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 4 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +cgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble cgemm_ncopy_L1_BEGIN + +cgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L2_M2_40 + +cgemm_ncopy_L2_M2_20: + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + + COPY2x2 + subs I , I , #1 + ble cgemm_ncopy_L2_M2_40 + + COPY2x2 + subs I , I , #1 + bne cgemm_ncopy_L2_M2_20 + + +cgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L2_M2_END + 
+cgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne cgemm_ncopy_L2_M2_60 + + +cgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +cgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble cgemm_ncopy_L999 + + +cgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble cgemm_ncopy_L1_M2_40 + +cgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_20 + + +cgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble cgemm_ncopy_L1_M2_END + +cgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne cgemm_ncopy_L1_M2_60 + + +cgemm_ncopy_L1_M2_END: + + + +cgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemm_tcopy_2_vfp.S b/kernel/arm/cgemm_tcopy_2_vfp.S new file mode 100644 index 000000000..9036b994d --- /dev/null +++ b/kernel/arm/cgemm_tcopy_2_vfp.S @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + fldmias AO1, { s0 - s3 } + + add r3, AO1, LDA + fldmias r3, { s4 - s7 } + + fstmias BO1, { s0 - s7 } + add AO1, AO1, 
#16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmias AO1, { s0 -s1 } + + add r3, AO1, LDA + fldmias r3, { s2 - s3 } + + fstmias BO2, { s0 - s3 } + add AO1, AO1, #8 + add BO2, BO2, #16 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmias AO1, { s0 - s3 } + + fstmias BO1, { s0 - s3 } + add AO1, AO1, #16 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmias AO1, { s0 - s1 } + + fstmias BO2, { s0 - s1 } + add AO1, AO1, #8 + add BO2, BO2, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #4 // M4 = M * 2 * SIZE * 2 + +cgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = M / 2 + ble cgemm_tcopy_L1_BEGIN + +cgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = N / 2 + ble cgemm_tcopy_L2_M2_60 + +cgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne cgemm_tcopy_L2_M2_40 + +cgemm_tcopy_L2_M2_60: + + tst N , #1 + ble cgemm_tcopy_L2_M2_END + + COPY1x2 + + +cgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne cgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +cgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble cgemm_tcopy_L999 + + 
+cgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #16 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = N / 2 + ble cgemm_tcopy_L1_M2_60 + + +cgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + bne cgemm_tcopy_L1_M2_40 + +cgemm_tcopy_L1_M2_60: + + tst N , #1 + ble cgemm_tcopy_L1_M2_END + + COPY1x1 + + +cgemm_tcopy_L1_M2_END: + + + +cgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S new file mode 100644 index 000000000..522c4c764 --- /dev/null +++ b/kernel/arm/cgemv_n_vfp.S @@ -0,0 +1,697 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_M r0 + +#define AO1 r0 +#define N r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define ALPHA_I [fp, #-236] +#define ALPHA_R [fp, #-244] + +#define M [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 64 +#define Y_PRE 0 +#define A_PRE 0 + +/**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + 
#define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + +.macro INIT_F4 + + pld [ YO, #Y_PRE ] + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_F4X4 + + pld [ XO, #X_PRE ] + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + KERNEL_F4X1 + +.endm + +.macro KERNEL_F4X1 + + pld [ AO2, #A_PRE ] + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_F4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO!, { s4 - s7 } + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + + + + +.macro INIT_F1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_F1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + 
flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, #8 + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_F1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, #8 + +.endm + +/****************************************************************************************/ + +.macro INIT_S4 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL_S4X4 + + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + KERNEL_S4X1 + +.endm + +.macro KERNEL_S4X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + flds s2 , [ AO1, #8 ] + flds s3 , [ AO1, #12 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + flds s0 , [ AO1, #16 ] + flds s1 , [ AO1, #20 ] + flds s2 , [ AO1, #24 ] + flds s3 , [ AO1, #28 ] + + fmacs s12 , s0, s4 + fmacs s13 , s0, s5 + fmacs s14 , s2, s4 + fmacs s15 , s2, s5 + + KMAC_R s12 , s1, s5 + KMAC_I s13 , s1, s4 + KMAC_R s14 , s3, s5 + KMAC_I s15 , s3, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + add AO2 , AO2, LDA + +.endm + +.macro SAVE_S4 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { 
s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + + + + +.macro INIT_S1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL_S1X1 + + flds s0 , [ AO1 ] + flds s1 , [ AO1, #4 ] + + flds s4 , [ XO ] + flds s5 , [ XO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + + add XO , XO, INC_X + add AO1 , AO1, LDA + + +.endm + +.macro SAVE_S1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp OLD_M, #0 + ble cgemvn_kernel_L999 + + cmp N, #0 + ble cgemvn_kernel_L999 + + str OLD_A, A + str OLD_M, M + vstr s0 , ALPHA_R + vstr s1 , ALPHA_I + + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvn_kernel_L999 + + cmp INC_Y, #0 + beq cgemvn_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE * 2 +#else + lsl LDA, LDA, #3 // LDA * SIZE * 2 +#endif + + cmp INC_X, #1 + bne cgemvn_kernel_S4_BEGIN + + cmp INC_Y, #1 + bne cgemvn_kernel_S4_BEGIN + + +cgemvn_kernel_F4_BEGIN: + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_F1_BEGIN + +cgemvn_kernel_F4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 
+ str r3 , A + + add AO2, AO2, LDA + add AO2, AO2, LDA + + ldr XO , X + + INIT_F4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_F4X1 + + +cgemvn_kernel_F4X4_10: + + KERNEL_F4X4 + + subs J, J, #1 + bne cgemvn_kernel_F4X4_10 + + +cgemvn_kernel_F4X1: + + ands J, N , #3 + ble cgemvn_kernel_F4_END + +cgemvn_kernel_F4X1_10: + + KERNEL_F4X1 + + subs J, J, #1 + bne cgemvn_kernel_F4X1_10 + + +cgemvn_kernel_F4_END: + + SAVE_F4 + + subs I , I , #1 + bne cgemvn_kernel_F4X4 + + +cgemvn_kernel_F1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_F1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_F1 + + mov J, N + + +cgemvn_kernel_F1X1_10: + + KERNEL_F1X1 + + subs J, J, #1 + bne cgemvn_kernel_F1X1_10 + + +cgemvn_kernel_F1_END: + + SAVE_F1 + + subs I , I , #1 + bne cgemvn_kernel_F1X1 + + b cgemvn_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvn_kernel_S4_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + + ldr YO , Y + + ldr I, M + asrs I, I, #2 // I = M / 4 + ble cgemvn_kernel_S1_BEGIN + +cgemvn_kernel_S4X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO1, #32 + str r3 , A + + ldr XO , X + + INIT_S4 + + asrs J, N, #2 // J = N / 4 + ble cgemvn_kernel_S4X1 + + +cgemvn_kernel_S4X4_10: + + KERNEL_S4X4 + + subs J, J, #1 + bne cgemvn_kernel_S4X4_10 + + +cgemvn_kernel_S4X1: + + ands J, N , #3 + ble cgemvn_kernel_S4_END + +cgemvn_kernel_S4X1_10: + + KERNEL_S4X1 + + subs J, J, #1 + bne cgemvn_kernel_S4X1_10 + + +cgemvn_kernel_S4_END: + + SAVE_S4 + + subs I , I , #1 + bne cgemvn_kernel_S4X4 + + +cgemvn_kernel_S1_BEGIN: + + ldr I, M + ands I, I , #3 + ble cgemvn_kernel_L999 + +cgemvn_kernel_S1X1: + + ldr AO1, A + add r3, AO1, #8 + str r3, A + + ldr XO , X + + INIT_S1 + + mov 
J, N + + +cgemvn_kernel_S1X1_10: + + KERNEL_S1X1 + + subs J, J, #1 + bne cgemvn_kernel_S1X1_10 + + +cgemvn_kernel_S1_END: + + SAVE_S1 + + subs I , I , #1 + bne cgemvn_kernel_S1X1 + + +/*************************************************************************************************************/ + +cgemvn_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S new file mode 100644 index 000000000..52276a06f --- /dev/null +++ b/kernel/arm/cgemv_t_vfp.S @@ -0,0 +1,607 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/29 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#define OLD_A r3 +#define OLD_N r1 + +#define M r0 +#define AO1 r1 +#define J r2 + +#define AO2 r4 +#define XO r5 +#define YO r6 +#define LDA r7 +#define INC_X r8 +#define INC_Y r9 + +#define I r12 + +#define N [fp, #-252 ] +#define A [fp, #-256 ] + + +#define X_PRE 512 +#define A_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if !defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CONJ) && !defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif !defined(CONJ) && defined(XCONJ) + + #define KMAC_R fmacs + #define KMAC_I 
fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + + +.macro INIT_F2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_F2X4 + + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + KERNEL_F2X1 + +.endm + +.macro KERNEL_F2X1 + + fldmias XO! , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + +.endm + +.macro SAVE_F2 + + fldmias YO, { s4 - s7 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO!, { s4 - s7 } + +.endm + +/************************************************************************************************/ + +.macro INIT_F1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_F1X4 + + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + KERNEL_F1X1 + +.endm + +.macro KERNEL_F1X1 + + fldmias XO! 
, { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + +.endm + +.macro SAVE_F1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO!, { s4 - s5 } + +.endm + +/************************************************************************************************/ + +.macro INIT_S2 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + vsub.f32 s14, s14, s14 + vsub.f32 s15, s15, s15 + +.endm + +.macro KERNEL_S2X4 + + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + KERNEL_S2X1 + +.endm + +.macro KERNEL_S2X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + fldmias AO2!, { s8 - s9 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + fmacs s14 , s8 , s2 + fmacs s15 , s8 , s3 + KMAC_R s14 , s9 , s3 + KMAC_I s15 , s9 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S2 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add YO, YO, INC_Y + + fldmias YO, { s6 - s7 } + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias YO, { s6 - s7 } + + add YO, YO, INC_Y + +.endm + +/************************************************************************************************/ + +.macro INIT_S1 + + vsub.f32 s12, s12, s12 + vsub.f32 s13, s13, s13 + +.endm + +.macro KERNEL_S1X4 + + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + KERNEL_S1X1 + +.endm + +.macro KERNEL_S1X1 + + fldmias XO , { s2 - s3 } + fldmias AO1!, { s4 - s5 } + + fmacs s12 , s4 , s2 + fmacs s13 , s4 , s3 + KMAC_R s12 , s5 , s3 + KMAC_I s13 , s5 , s2 + + add XO, XO, INC_X + +.endm + +.macro SAVE_S1 + + fldmias YO, { s4 - s5 } + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias YO, { s4 - s5 } + + add 
YO, YO, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 - r9 , fp} + add fp, sp, #28 + sub sp, sp, #STACKSIZE // reserve stack + + sub r12, fp, #192 + +#if defined(DOUBLE) + vstm r12, { d8 - d15 } // store floating point registers +#else + vstm r12, { s8 - s15 } // store floating point registers +#endif + + cmp M, #0 + ble cgemvt_kernel_L999 + + cmp OLD_N, #0 + ble cgemvt_kernel_L999 + + str OLD_A, A + str OLD_N, N + + ldr INC_X , OLD_INC_X + ldr INC_Y , OLD_INC_Y + + cmp INC_X, #0 + beq cgemvt_kernel_L999 + + cmp INC_Y, #0 + beq cgemvt_kernel_L999 + + ldr LDA, OLD_LDA + + +#if defined(DOUBLE) + lsl LDA, LDA, #4 // LDA * SIZE +#else + lsl LDA, LDA, #3 // LDA * SIZE +#endif + + cmp INC_X, #1 + bne cgemvt_kernel_S2_BEGIN + + cmp INC_Y, #1 + bne cgemvt_kernel_S2_BEGIN + + +cgemvt_kernel_F2_BEGIN: + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_F1_BEGIN + +cgemvt_kernel_F2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_F2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F2X1 + + +cgemvt_kernel_F2X4_10: + + KERNEL_F2X4 + + subs I, I, #1 + bne cgemvt_kernel_F2X4_10 + + +cgemvt_kernel_F2X1: + + ands I, M , #3 + ble cgemvt_kernel_F2_END + +cgemvt_kernel_F2X1_10: + + KERNEL_F2X1 + + subs I, I, #1 + bne cgemvt_kernel_F2X1_10 + + +cgemvt_kernel_F2_END: + + SAVE_F2 + + subs J , J , #1 + bne cgemvt_kernel_F2X4 + + +cgemvt_kernel_F1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_F1X4: + + ldr AO1, A + + ldr XO , X + + INIT_F1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_F1X1 + + +cgemvt_kernel_F1X4_10: + + KERNEL_F1X4 + + subs I, I, #1 + bne cgemvt_kernel_F1X4_10 + + +cgemvt_kernel_F1X1: + + ands I, M , #3 + ble cgemvt_kernel_F1_END + 
+cgemvt_kernel_F1X1_10: + + KERNEL_F1X1 + + subs I, I, #1 + bne cgemvt_kernel_F1X1_10 + + +cgemvt_kernel_F1_END: + + SAVE_F1 + + b cgemvt_kernel_L999 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_S2_BEGIN: + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#endif + + ldr YO , Y + + ldr J, N + asrs J, J, #1 // J = N / 2 + ble cgemvt_kernel_S1_BEGIN + +cgemvt_kernel_S2X4: + + ldr AO1, A + add AO2, AO1, LDA + add r3 , AO2, LDA + str r3 , A + + ldr XO , X + + INIT_S2 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S2X1 + + +cgemvt_kernel_S2X4_10: + + KERNEL_S2X4 + + subs I, I, #1 + bne cgemvt_kernel_S2X4_10 + + +cgemvt_kernel_S2X1: + + ands I, M , #3 + ble cgemvt_kernel_S2_END + +cgemvt_kernel_S2X1_10: + + KERNEL_S2X1 + + subs I, I, #1 + bne cgemvt_kernel_S2X1_10 + + +cgemvt_kernel_S2_END: + + SAVE_S2 + + subs J , J , #1 + bne cgemvt_kernel_S2X4 + + +cgemvt_kernel_S1_BEGIN: + + ldr J, N + ands J, J, #1 + ble cgemvt_kernel_L999 + +cgemvt_kernel_S1X4: + + ldr AO1, A + + ldr XO , X + + INIT_S1 + + asrs I, M, #2 // I = M / 4 + ble cgemvt_kernel_S1X1 + + +cgemvt_kernel_S1X4_10: + + KERNEL_S1X4 + + subs I, I, #1 + bne cgemvt_kernel_S1X4_10 + + +cgemvt_kernel_S1X1: + + ands I, M , #3 + ble cgemvt_kernel_S1_END + +cgemvt_kernel_S1X1_10: + + KERNEL_S1X1 + + subs I, I, #1 + bne cgemvt_kernel_S1X1_10 + + +cgemvt_kernel_S1_END: + + SAVE_S1 + + + +/*************************************************************************************************************/ + +cgemvt_kernel_L999: + + sub r3, fp, #192 + +#if defined(DOUBLE) + vldm r3, { d8 - d15 } // restore floating point registers +#else + vldm r3, { s8 - s15 } // restore floating point registers +#endif + + mov r0, #0 // set return value + + sub sp, fp, #28 + pop {r4 -r9 ,fp} + bx lr + + EPILOGUE + diff 
--git a/kernel/arm/copy.c b/kernel/arm/copy.c new file mode 100644 index 000000000..f742a4a33 --- /dev/null +++ b/kernel/arm/copy.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..a68434f97 --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -0,0 +1,1455 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fnmacs + #define FMAC_I1 fmacs + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacs + #define KMAC_I fnmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#else + + #define KMAC_R fnmacs + #define KMAC_I fmacs + + #define FMAC_R1 fmacs + #define FMAC_R2 fmacs + #define FMAC_I1 fnmacs + #define FMAC_I2 fmacs + +#endif + + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + +.macro KERNEL2x2_I + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + + fmuls s8 , s0, s4 + fmuls s9 , s0, s5 + fmuls s10 , s2, s4 + fmuls s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmuls s12 , s0, s6 + fmuls s13 , s0, s7 + fmuls s14 , s2, s6 + fmuls s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + + + +.macro KERNEL2x2_M1 + + pld [ AO, #A_PRE ] + fldmias AO!, { s0 - s3 } + pld [ BO, #B_PRE ] + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + +.endm + +.macro KERNEL2x2_M2 + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs 
s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + + +.macro KERNEL2x2_E + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s7 } + + fmacs s8 , s0, s4 + fmacs s9 , s0, s5 + fmacs s10 , s2, s4 + fmacs s11 , s2, s5 + + KMAC_R s8 , s1, s5 + KMAC_I s9 , s1, s4 + KMAC_R s10 , s3, s5 + KMAC_I s11 , s3, s4 + + fmacs s12 , s0, s6 + fmacs s13 , s0, s7 + fmacs s14 , s2, s6 + fmacs s15 , s2, s7 + + KMAC_R s12 , s1, s7 + KMAC_I s13 , s1, s6 + KMAC_R s14 , s3, s7 + KMAC_I s15 , s3, s6 + + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + FMAC_R1 s6 , s0 , s14 + FMAC_I1 s7 , s0 , s15 + FMAC_R2 s6 , s1 , s15 + FMAC_I2 s7 , s1 , s14 + + fstmias CO2, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL1x2_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmuls s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x2_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x2_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 + fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + +.macro KERNEL1x2_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + flds s6 , [ BO, #8 ] + flds s7 , [ BO, #12 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s12 , s0, s6 + KMAC_R s12 , s1, s7 
+ fmacs s13 , s0, s7 + KMAC_I s13 , s1, s6 + + add BO , BO, #16 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s12 + FMAC_I1 s5 , s0 , s13 + FMAC_R2 s4 , s1 , s13 + FMAC_I2 s5 , s1 , s12 + + fstmias CO2, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + +.macro KERNEL2x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmuls s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmuls s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + +.macro KERNEL2x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro KERNEL2x1_E + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 
, [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + +.macro KERNEL2x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + fmacs s10 , s2, s4 + KMAC_R s10 , s3, s5 + fmacs s11 , s2, s5 + KMAC_I s11 , s3, s4 + + add BO , BO, #8 + add AO , AO, #16 + + +.endm + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + vsub.f32 s6, s6, s6 + vsub.f32 s7, s7, s7 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + FMAC_R1 s6 , s0 , s10 + FMAC_I1 s7 , s0 , s11 + FMAC_R2 s6 , s1 , s11 + FMAC_I2 s7 , s1 , s10 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL1x1_I + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmuls s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmuls s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + +.macro KERNEL1x1_M1 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_M2 + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro KERNEL1x1_E + + flds s0 , [ AO ] + 
flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + fmacs s8 , s0, s4 + KMAC_R s8 , s1, s5 + fmacs s9 , s0, s5 + KMAC_I s9 , s1, s4 + + add BO , BO, #8 + add AO , AO, #8 + + +.endm + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + vsub.f32 s4, s4, s4 + vsub.f32 s5, s5, s5 + + FMAC_R1 s4 , s0 , s8 + FMAC_I1 s5 , s0 , s9 + FMAC_R2 s4 , s1 , s9 + FMAC_I2 s5 , s1 , s8 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , 
#4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + 
+_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + 
str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne 
_L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..28e555caa --- /dev/null +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1476 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R s0 +#define OLD_ALPHA_I s1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fnmacs + +#elif defined(CN) || defined(CT) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fmacs + +#elif defined(NC) || defined(TC) + + #define FADD_R fadds + #define FADD_I fsubs + + #define FMAC_R1 fmuls + #define FMAC_R2 fnmacs + #define FMAC_I1 fmuls + #define FMAC_I2 fmacs + +#else + + #define FADD_R fsubs + #define FADD_I fadds + + #define FMAC_R1 fnmuls + #define FMAC_R2 fmacs + #define FMAC_I1 fnmuls + #define FMAC_I2 fnmacs + 
+#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fldmias AO!, { s4 - s5 } + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + fldmias BO!, { s12 - s13 } + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fldmias AO!, { s6 - s7 } + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + fldmias BO!, { s14 - s15 } + fmuls s22 , s2, s10 + fmuls s30 , s3, s11 + fmuls s23 , s2, s11 + fmuls s31 , s3, s10 + +.endm + + + +.macro KERNEL2x2_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacs s16 , s4, s12 + pld [ BO , #B_PRE ] + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fldmias AO!, { s0 - s1 } + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fldmias BO!, { s8 - s9 } + fmacs s19 , s6, s13 + 
fmacs s27 , s7, s12 + + fldmias AO!, { s2 - s3 } + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fldmias BO!, { s10 - s11 } + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + + +.macro KERNEL2x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + fmacs s22 , s6, s14 + fmacs s30 , s7, s15 + fmacs s23 , s6, s15 + fmacs s31 , s7, s14 + +.endm + +.macro KERNEL2x2_SUB + + fldmias AO!, { s0 - s1 } + fldmias BO!, { s8 - s9 } + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fldmias AO!, { s2 - s3 } + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fldmias BO!, { s10 - s11 } + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + fmacs s22 , s2, s10 + fmacs s30 , s3, s11 + fmacs s23 , s2, s11 + fmacs s31 , s3, s10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + FADD_R s22, s30 , s22 + FADD_I s23, s31 , s23 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + FMAC_R1 s10, s0 , s22 + FMAC_I1 s11, s0 , s23 + FMAC_R2 s10, s1 , s23 + FMAC_I2 s11, s1 , s22 + + fstmias CO1, { s4 - s7 } + fstmias CO2, { s8 - s11 } + + add CO1, CO1, #16 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s20 , s0, s10 + fmuls s28 , s1, s11 + fmuls s21 , s0, s11 + fmuls s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + flds s14, [ BO, #8 ] + flds s15, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + add BO , BO, #16 + add AO , AO, #8 +.endm + + +.macro KERNEL1x2_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s20 , s4, s14 + fmacs s28 , s5, s15 + fmacs s21 , s4, s15 + fmacs s29 , s5, s14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE 
] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s20 , s0, s10 + fmacs s28 , s1, s11 + fmacs s21 , s0, s11 + fmacs s29 , s1, s10 + + add BO , BO, #16 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s20, s28 , s20 + FADD_I s21, s29 , s21 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s8 , s0 , s20 + FMAC_I1 s9 , s0 , s21 + FMAC_R2 s8 , s1 , s21 + FMAC_I2 s9 , s1 , s20 + + fstmias CO1, { s4 - s5 } + fstmias CO2, { s8 - s9 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + fmuls s18 , s2, s8 + fmuls s26 , s3, s9 + fmuls s19 , s2, s9 + fmuls s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 
+ fmacs s27 , s3, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + flds s6 , [ AO, #8 ] + flds s7 , [ AO, #12 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #16 +.endm + + +.macro KERNEL2x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + fmacs s18 , s6, s12 + fmacs s26 , s7, s13 + fmacs s19 , s6, s13 + fmacs s27 , s7, s12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + fmacs s18 , s2, s8 + fmacs s26 , s3, s9 + fmacs s19 , s2, s9 + fmacs s27 , s3, s8 + + add BO , BO, #8 + add AO , AO, #16 + +.endm + + + + +.macro SAVE2x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + FADD_R s18, s26 , s18 + FADD_I s19, s27 , s19 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + FMAC_R1 s6 , s0 , s18 + FMAC_I1 s7 , s0 , s19 + FMAC_R2 s6 , s1 , s19 + FMAC_I2 s7 , s1 , s18 + + fstmias CO1, { s4 - s7 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + 
flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmuls s16 , s0, s8 + fmuls s24 , s1, s9 + fmuls s17 , s0, s9 + fmuls s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + + +.macro KERNEL1x1_M1 + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + flds s4 , [ AO, #0 ] + flds s5 , [ AO, #4 ] + + flds s12, [ BO ] + flds s13, [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + +.macro KERNEL1x1_M2 + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + + flds s0 , [ AO, #0 ] + flds s1 , [ AO, #4 ] + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + add BO , BO, #8 + add AO , AO, #8 +.endm + + +.macro KERNEL1x1_E + + fmacs s16 , s4, s12 + fmacs s24 , s5, s13 + fmacs s17 , s4, s13 + fmacs s25 , s5, s12 + +.endm + +.macro KERNEL1x1_SUB + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + fmacs s16 , s0, s8 + fmacs s24 , s1, s9 + fmacs s17 , s0, s9 + fmacs s25 , s1, s8 + + add BO , BO, #8 + add AO , AO, #8 + +.endm + + + + +.macro SAVE1x1 + + flds s0, ALPHA_R + flds s1, ALPHA_I + + FADD_R s16, s24 , s16 + FADD_I s17, s25 , s17 + + FMAC_R1 s4 , s0 , s16 + FMAC_I1 s5 , s0 , s17 + FMAC_R2 s4 , s1 , s17 + FMAC_I2 s5 , s1 , s16 + + fstmias CO1, { s4 - s5 } + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, 
ALPHA_I + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 4 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + 
SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 4 * 2 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + 
KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 * 4 * 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 * 4 * 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values 
in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dcopy_vfp.S b/kernel/arm/dcopy_vfp.S new file mode 100644 index 000000000..0fad3c4a6 --- /dev/null +++ b/kernel/arm/dcopy_vfp.S @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/07 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+// DCOPY kernel (double-precision vector copy, y := x) for ARM VFP.
+// Arguments: N in r0, X in r1, INC_X in r2, Y in r3; INC_Y is read from the
+// stack via OLD_INC_Y below. Returns 0 in r0.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+// Prefetch distance in bytes for the unit-stride path -- presumably tuned for
+// the target ARMv7 core; TODO confirm before changing.
+#define X_PRE 256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Copy four doubles X -> Y with post-increment (unit-stride fast path).
+.macro COPY_F4
+
+ pld [ X, #X_PRE ]
+ fldmiad X!, { d0 - d3 }
+ fstmiad Y!, { d0 - d3 }
+
+.endm
+
+// Copy a single double, unit stride (remainder of the 4x unrolled loop).
+.macro COPY_F1
+
+ fldmiad X!, { d0 }
+ fstmiad Y!, { d0 }
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+// Copy four doubles with arbitrary byte strides INC_X/INC_Y (pre-scaled by 8).
+// Alternates d0/d1 so consecutive load/store pairs are independent.
+// NOTE(review): the leading nop looks like a scheduling/alignment aid -- confirm.
+.macro COPY_S4
+
+ nop
+ fldmiad X, { d0 }
+ fstmiad Y, { d0 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+
+ fldmiad X, { d1 }
+ fstmiad Y, { d1 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+
+ fldmiad X, { d0 }
+ fstmiad Y, { d0 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+
+ fldmiad X, { d1 }
+ fstmiad Y, { d1 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+
+.endm
+
+
+// Copy one double with arbitrary strides.
+.macro COPY_S1
+
+ fldmiad X, { d0 }
+ fstmiad Y, { d0 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ push {r4 - r9, fp}
+ add fp, sp, #24
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r4, fp, #128
+ vstm r4, { d8 - d15} // store floating point registers (callee-saved)
+
+ mov Y, OLD_Y
+ ldr INC_Y, OLD_INC_Y
+
+ cmp N, #0
+ ble dcopy_kernel_L999 // nothing to do for N <= 0
+
+ cmp INC_X, #0
+ beq dcopy_kernel_L999 // zero increment: return without copying
+
+ cmp INC_Y, #0
+ beq dcopy_kernel_L999
+
+ cmp INC_X, #1
+ bne dcopy_kernel_S_BEGIN // any non-unit stride -> generic strided path
+
+ cmp INC_Y, #1
+ bne dcopy_kernel_S_BEGIN
+
+// Fast path: both vectors contiguous (INC_X == INC_Y == 1).
+dcopy_kernel_F_BEGIN:
+
+ asrs I, N, #2 // I = N / 4
+ ble dcopy_kernel_F1
+
+dcopy_kernel_F4:
+
+ COPY_F4
+
+ subs I, I, #1
+ bne dcopy_kernel_F4
+
+dcopy_kernel_F1:
+
+ ands I, N, #3 // remainder of the 4x unroll
+ ble dcopy_kernel_L999
+
+dcopy_kernel_F10:
+
+ COPY_F1
+
+ subs I, I, #1
+ bne dcopy_kernel_F10
+
+ b dcopy_kernel_L999
+
+// Generic path: convert element strides to byte strides (double = 8 bytes).
+dcopy_kernel_S_BEGIN:
+
+ lsl INC_X, INC_X, #3 // INC_X * SIZE
+ lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
+
+ asrs I, N, #2 // I = N / 4
+ ble dcopy_kernel_S1
+
+dcopy_kernel_S4:
+
+ COPY_S4
+
+ subs I, I, #1
+ bne dcopy_kernel_S4
+
+dcopy_kernel_S1:
+
+ ands I, N, #3
+ ble dcopy_kernel_L999
+
+dcopy_kernel_S10:
+
+ COPY_S1
+
+ subs I, I, #1
+ bne dcopy_kernel_S10
+
+
+
+
+
+
+dcopy_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ mov r0, #0 // set return value
+ sub sp, fp, #24
+ pop {r4 - r9, fp}
+ bx lr
+
+ EPILOGUE
+
diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S
new file mode 100644
index 000000000..ab819ec98
--- /dev/null
+++ b/kernel/arm/ddot_vfp.S
@@ -0,0 +1,248 @@
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/11 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+// DDOT kernel (double-precision dot product, sum of x[i]*y[i]) for ARM VFP.
+// Arguments: N in r0, X in r1, INC_X in r2, Y in r3; INC_Y is read from the
+// stack via OLD_INC_Y below. The result is returned in d0 (see the
+// "set return value" vadd at the end).
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define N r0
+#define X r1
+#define INC_X r2
+#define OLD_Y r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define OLD_INC_Y [fp, #4 ]
+
+#define I r5
+#define Y r6
+#define INC_Y r7
+
+// Prefetch distance in bytes -- presumably tuned for the target core; confirm.
+#define X_PRE 512
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Four products per invocation, unit stride. Partial sums alternate between
+// the two accumulators d0 and d1 to break the multiply-accumulate dependency
+// chain; loads are interleaved with the fmacd's to hide latency.
+.macro KERNEL_F4
+
+ pld [ X, #X_PRE ]
+ fldmiad X!, { d8 }
+ pld [ Y, #X_PRE ]
+ fldmiad Y!, { d4 }
+ fldmiad Y!, { d5 }
+ fmacd d0 , d4, d8
+ fldmiad X!, { d9 }
+ fldmiad Y!, { d6 }
+ fmacd d1 , d5, d9
+ fldmiad X!, { d10 }
+ fldmiad X!, { d11 }
+ fmacd d0 , d6, d10
+ fldmiad Y!, { d7 }
+ fmacd d1 , d7, d11
+
+.endm
+
+// One product, unit stride (remainder loop).
+.macro KERNEL_F1
+
+ fldmiad X!, { d4 }
+ fldmiad Y!, { d8 }
+ fmacd d0 , d4, d8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+// Four products with arbitrary byte strides; same dual-accumulator scheme.
+// NOTE(review): the leading nop looks like a scheduling/alignment aid -- confirm.
+.macro KERNEL_S4
+
+ nop
+ fldmiad X, { d4 }
+ fldmiad Y, { d8 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+ fmacd d0 , d4, d8
+
+ fldmiad X, { d5 }
+ fldmiad Y, { d9 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+ fmacd d1 , d5, d9
+
+ fldmiad X, { d6 }
+ fldmiad Y, { d10 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+ fmacd d0 , d6, d10
+
+ fldmiad X, { d7 }
+ fldmiad Y, { d11 }
+ add X, X, INC_X
+ add Y, Y, INC_Y
+ fmacd d1 , d7, d11
+
+.endm
+
+
+// One product with arbitrary strides.
+.macro KERNEL_S1
+
+ fldmiad X, { d4 }
+ fldmiad Y, { d8 }
+ add X, X, INC_X
+ fmacd d0 , d4, d8
+ add Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ push {r4 - r9, fp}
+ add fp, sp, #24
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ sub r4, fp, #128
+ vstm r4, { d8 - d15} // store floating point registers (callee-saved)
+
+ mov Y, OLD_Y
+ ldr INC_Y, OLD_INC_Y
+
+ vsub.f64 d0 , d0 , d0 // zero both partial-sum accumulators
+ vsub.f64 d1 , d1 , d1
+
+ cmp N, #0
+ ble ddot_kernel_L999 // N <= 0: result stays 0.0
+
+ cmp INC_X, #0
+ beq ddot_kernel_L999 // zero increment: return 0.0 without computing
+
+ cmp INC_Y, #0
+ beq ddot_kernel_L999
+
+ cmp INC_X, #1
+ bne ddot_kernel_S_BEGIN // any non-unit stride -> generic strided path
+
+ cmp INC_Y, #1
+ bne ddot_kernel_S_BEGIN
+
+// Fast path: both vectors contiguous.
+ddot_kernel_F_BEGIN:
+
+ asrs I, N, #2 // I = N / 4
+ ble ddot_kernel_F1
+
+// Two KERNEL_F4 per trip with an early exit between them, so odd iteration
+// counts leave via the first subs/ble.
+ddot_kernel_F4:
+
+ KERNEL_F4
+
+ subs I, I, #1
+ ble ddot_kernel_F1
+
+
+ KERNEL_F4
+
+ subs I, I, #1
+ bne ddot_kernel_F4
+
+ddot_kernel_F1:
+
+ ands I, N, #3 // remainder of the 4x unroll
+ ble ddot_kernel_L999
+
+ddot_kernel_F10:
+
+ KERNEL_F1
+
+ subs I, I, #1
+ bne ddot_kernel_F10
+
+ b ddot_kernel_L999
+
+// Generic path: convert element strides to byte strides (double = 8 bytes).
+ddot_kernel_S_BEGIN:
+
+ lsl INC_X, INC_X, #3 // INC_X * SIZE
+ lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
+
+ asrs I, N, #2 // I = N / 4
+ ble ddot_kernel_S1
+
+ddot_kernel_S4:
+
+ KERNEL_S4
+
+ subs I, I, #1
+ bne ddot_kernel_S4
+
+ddot_kernel_S1:
+
+ ands I, N, #3
+ ble ddot_kernel_L999
+
+ddot_kernel_S10:
+
+ KERNEL_S1
+
+ subs I, I, #1
+ bne ddot_kernel_S10
+
+
+
+
+
+
+ddot_kernel_L999:
+
+ sub r3, fp, #128
+ vldm r3, { d8 - d15} // restore floating point registers
+
+ vadd.f64 d0 , d0, d1 // set return value: combine the two partial sums in d0
+ sub sp, fp, #24
+ pop {r4 - r9, fp}
+ bx lr
+
+ EPILOGUE
+
diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S
new file mode 100644
index 000000000..55409a5ef
--- /dev/null
+++ b/kernel/arm/dgemm_kernel_4x2_vfp.S
@@ -0,0 +1,806 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/27 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + pld [ AO, #A_PRE ] + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] + fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] + fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + add AO , AO, #32 + fmacd d14 , d2, d5 + add BO , BO, #16 + fmacd d15 , d3, d5 + + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + 
fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + pld [ CO1, #C_PRE ] + fmacd d4 , d0 , d8 + fldd d6 , [CO1, #16 ] + fmacd d5 , d0 , d9 + fldd d7 , [CO1, #24 ] + fmacd d6 , d0 , d10 + fstd d4 , [CO1] + fmacd d7 , d0 , d11 + + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + pld [ CO2, #C_PRE ] + fmacd d4 , d0 , d12 + fldd d6 , [CO2, #16 ] + fmacd d5 , d0 , d13 + fldd d7 , [CO2, #24 ] + fmacd d6 , d0 , d14 + fstd d4 , [CO2] + fmacd d7 , d0 , d15 + add CO1, CO1, #32 + + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fldd d4 , [CO2] + fldd d5 , [CO2, #8 ] + + fmacd d4 , d0 , d12 + fmacd d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + fldd d4 , [CO2] + + fmacd d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + 
+/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + fldd d6 , [CO1, #16 ] + fldd d7 , [CO1, #24 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + fmacd d6 , d0 , d10 + fmacd d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + fldd d5 , [CO1, #8 ] + + fmacd d4 , d0 , d8 + fmacd d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fldd d4 , [CO1] + + fmacd d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + 
add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble dgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble 
dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + 
+dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..3b6af19a3 --- /dev/null +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1483 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/11 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 96 +* DGEMM_R 512 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.57 GFLOPS ATLAS: 1.59 GFLOPS +* 2 Cores: 3.14 GFLOPS ATLAS: 3.16 GFLOPS +* 3 Cores: 4.56 GFLOPS ATLAS: 4.60 GFLOPS +* 4 Cores: 5.82 GFLOPS ATLAS: 5.41 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + 
vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + fldd d8 , [ BO ] + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + + fldd d1 , [ AO, #8 ] + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmuld d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmuld d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmuld d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmuld d21 , d1, d9 + add BO , BO, #32 + add AO , AO, #32 + fmuld d22 , d2, d9 + + pld [ BO , #B_PRE ] + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + pld [ AO , #A_PRE ] + fldd d4 , [ AO, #0 ] + fmuld d24 , d0, d10 + fldd d5 , [ AO, #8 ] + fmuld d25 , d1, d10 + fldd d6 , [ AO, #16 ] + fmuld d26 , d2, d10 + fldd d7 , [ AO, #24 ] + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + +.endm + + + +.macro KERNEL4x4_M2 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE+32 ] + fmacd d17 , d5, d12 + fldd d0 , [ AO , #32 ] + fmacd d18 , d6, d12 + pld [ BO , #B_PRE+32 ] + fmacd d19 , d7, d12 + + fldd d8 , [ BO , #32 ] + fmacd d20 , d4, d13 + fldd d1 , [ AO, #40 ] + fmacd d21 , d5, d13 + fldd d2 , [ AO, #48 ] + fmacd d22 , d6, d13 + fldd d3 , [ AO, #56 ] + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fldd d9 , [ BO, #40 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #48 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #56 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + add AO , AO, #64 + fmacd d30 , d6, d15 + add BO , BO, #64 + fmacd d31 , d7, d15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, d8 + fldd d4 , [ AO ] + fmacd d18 , d2, d8 + pld [ BO , #B_PRE ] + fmacd d19 , d3, d8 + + fldd d12, [ BO ] + fmacd d20 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d21 , d1, d9 + fldd d6 , [ AO, #16 ] + fmacd d22 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fldd d13, [ BO, #8 ] 
+ fmacd d26 , d2, d10 + fldd d14, [ BO, #16 ] + fmacd d27 , d3, d10 + + fldd d15, [ BO, #24 ] + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + +.endm + + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + fmacd d17 , d5, d12 + add BO , BO, #32 + add AO , AO, #32 + fmacd d18 , d6, d12 + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + + fldd d8 , [ BO ] + pld [ BO , #B_PRE ] + + fldd d0 , [ AO ] + pld [ AO , #A_PRE ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d17 , d1, d8 + fldd d3 , [ AO, #24 ] + fmacd d18 , d2, d8 + fldd d9 , [ BO, #8 ] + fmacd d19 , d3, d8 + + fldd d10, [ BO, #16 ] + fmacd d20 , d0, d9 + fldd d11, [ BO, #24 ] + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + pld [ CO2 , #C_PRE ] + + fldmiad CO1, { d8 - d11 } + pld [ r4 , #C_PRE ] + + fmacd d8 , d0 , d16 + fldd d12, [CO2] + fmacd d9 , d0 , d17 + fldd d13, [CO2, #8 ] + fmacd d10, d0 , d18 + fldd d14, [CO2, #16 ] + fmacd d11, d0 , d19 + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fstd d8 , [CO1] + fmacd d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmacd d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmacd d15, d0 , d23 + fstd d11, [CO1, #24 ] + + fldmiad r4, { d8 - d11 } + + fmacd d8 , d0 , d24 + fstd d12, [CO2] + fmacd d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmacd d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmacd 
d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + pld [ CO2 , #C_PRE ] + + fldmiad CO2, { d12 - d15 } + + fstd d8 , [r4 ] + fmacd d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmacd d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmacd d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmacd d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fldd d8 , [r4 ] + fldd d9 , [r4 , #8 ] + + fmacd d8 , d0 , d24 + fmacd d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d28 + fmacd d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + 
fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + fldd d8 , [r4 ] + fmacd d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fldd d12, [CO2] + fmacd d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + fldd d14, [CO2, #16 ] + fldd d15, [CO2, #24 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + fmacd d14, d0 , d22 + fmacd d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + 
+/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fldd d12, [CO2] + fldd d13, [CO2, #8 ] + + fmacd d12, d0 , d20 + fmacd d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + fldd d12, [CO2] + fmacd d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fldd d8 , 
[CO1] + fldd d9 , [CO1, #8 ] + fldd d10, [CO1, #16 ] + fldd d11, [CO1, #24 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + fmacd d10, d0 , d18 + fmacd d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fldd d9 , [CO1, #8 ] + + fmacd d8 , d0 , d16 + fmacd d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fldd d8 , [CO1] + fmacd d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble dgemm_kernel_L2_BEGIN + +dgemm_kernel_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 
4 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +dgemm_kernel_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #2 + blt dgemm_kernel_L4_M4_32 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_32: + + tst L, #1 + ble dgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + + +dgemm_kernel_L4_M4_40: + + INIT4x4 + + +dgemm_kernel_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne dgemm_kernel_L4_M4_46 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + + subs I, I, #1 + bne dgemm_kernel_L4_M4_20 + + +dgemm_kernel_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L4_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M2_100 + 
+dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + + +dgemm_kernel_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_kernel_L2_BEGIN: + + ldr J , N + tst J , #3 + ble dgemm_kernel_L999 + + tst J , #2 + ble dgemm_kernel_L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + 
subs I, I, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +/*********************************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble dgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +dgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble 
dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble dgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set 
return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_ncopy_2_vfp.S b/kernel/arm/dgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..763c032e1 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/24 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define LDA r8 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + + lsl LDA, OLD_LDA, #3 // lda = lda * 8 + + ldr BO, B + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A 
+ add AO2, AO1, LDA + add A , AO2, LDA // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L2_M2_40 + +dgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_20 + + +dgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L2_M2_END + +dgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M2_60 + + +dgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + add A , AO1, LDA // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble dgemm_ncopy_L1_M2_40 + +dgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_20 + + +dgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble dgemm_ncopy_L1_M2_END + +dgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M2_60 + + +dgemm_ncopy_L1_M2_END: + + + +dgemm_ncopy_L999: + + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_ncopy_4_vfp.S b/kernel/arm/dgemm_ncopy_4_vfp.S new file mode 100644 index 000000000..ad6692e50 --- /dev/null +++ b/kernel/arm/dgemm_ncopy_4_vfp.S @@ -0,0 +1,349 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 +#define AO3 r8 +#define AO4 r9 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions 
+**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + pld [ AO3, #A_PRE ] + pld [ AO4, #A_PRE ] + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + fldd d2 , [ AO3, #0 ] + fldd d3 , [ AO4, #0 ] + + fldd d4 , [ AO1, #8 ] + fldd d8 , [ AO1, #16 ] + fldd d12, [ AO1, #24 ] + + fldd d5 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d9 , [ AO2, #16 ] + fldd d13, [ AO2, #24 ] + + fldd d6 , [ AO3, #8 ] + add AO2, AO2, #32 + fldd d10, [ AO3, #16 ] + fldd d14, [ AO3, #24 ] + + fldd d7 , [ AO4, #8 ] + add AO3, AO3, #32 + fldd d11, [ AO4, #16 ] + fldd d15, [ AO4, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #32 + fstmiad BO!, { d4 - d7 } + fstmiad BO!, { d8 - d15 } + +.endm + +.macro COPY1x4 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + fldd d2 , [ AO3, #0 ] + add AO2, AO2, #8 + fldd d3 , [ AO4, #0 ] + + add AO3, AO3, #8 + fstmiad BO!, { d0 - d3 } + add AO4, AO4, #8 + +.endm + +.macro COPY4x2 + + fldd d0 , [ AO1, #0 ] + fldd d2 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d6 , [ AO1, #24 ] + + fldd d1 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d5 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO2, #0 ] + add AO1, AO1, #8 + + fstmiad BO!, { d0 - d1 } + add AO2, AO2, #8 + +.endm + +.macro COPY4x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + + fstmiad BO!, { d0 } + add AO1, AO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, 
#24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #3 // lda = lda * 8 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +dgemm_ncopy_L4_BEGIN: + + asrs J, N, #2 // J = N / 4 + ble dgemm_ncopy_L2_BEGIN + +dgemm_ncopy_L4_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add AO3, AO2, r4 + add AO4, AO3, r4 + add A , AO4, r4 // A = A + 4 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L4_M4_40 + +dgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_ncopy_L4_M4_20 + + +dgemm_ncopy_L4_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L4_M4_END + +dgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne dgemm_ncopy_L4_M4_60 + + +dgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_ncopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble dgemm_ncopy_L999 + + tst N, #2 + ble dgemm_ncopy_L1_BEGIN + +dgemm_ncopy_L2_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L2_M4_40 + +dgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M4_20 + + +dgemm_ncopy_L2_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L2_M4_END + +dgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne dgemm_ncopy_L2_M4_60 + + +dgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble dgemm_ncopy_L999 + + +dgemm_ncopy_L1_M4_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #2 // I = M / 4 + ble dgemm_ncopy_L1_M4_40 + +dgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M4_20 + + +dgemm_ncopy_L1_M4_40: + + ands I, M , #3 + ble dgemm_ncopy_L1_M4_END + +dgemm_ncopy_L1_M4_60: + + 
COPY1x1 + + subs I , I , #1 + bne dgemm_ncopy_L1_M4_60 + + +dgemm_ncopy_L1_M4_END: + + + +dgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dgemm_tcopy_4_vfp.S b/kernel/arm/dgemm_tcopy_4_vfp.S new file mode 100644 index 000000000..88a139ad8 --- /dev/null +++ b/kernel/arm/dgemm_tcopy_4_vfp.S @@ -0,0 +1,408 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/06 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 +#define BO3 r9 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY4x4 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d8 - d11 } + + add r3, r3, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d12 - d15 } + + fstmiad BO1, { d0 - d15 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro 
COPY2x4 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + add r3, r3, LDA + fldmiad r3, { d4 - d5 } + + add r3, r3, LDA + fldmiad r3, { d6 - d7 } + + fstmiad BO2, { d0 - d7 } + add AO1, AO1, #16 + add BO2, BO2, #64 + +.endm + +.macro COPY1x4 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + add r3, r3, LDA + fldmiad r3, { d2 } + + add r3, r3, LDA + fldmiad r3, { d3 } + + fstmiad BO3, { d0 - d3 } + add AO1, AO1, #8 + add BO3, BO3, #32 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x2 + + fldmiad AO1, { d0 - d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 } + + add r3, AO1, LDA + fldmiad r3, { d1 } + + fstmiad BO3, { d0 - d1 } + add AO1, AO1, #8 + add BO3, BO3, #16 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY4x1 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY2x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 } + + fstmiad BO3, { d0 } + add AO1, AO1, #8 + add BO3, BO3, #8 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve 
stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #3 // lda = lda * SIZE + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #3 // M * SIZE + + ldr r3, B + + and BO2 , N , #-4 + and BO3 , N , #-2 + + mul BO2, BO2, r4 + mul BO3, BO3, r4 + + add BO2 , BO2, r3 + add BO3 , BO3, r3 + + lsl M4, M, #5 // M4 = M * 4 * SIZE + +dgemm_tcopy_L4_BEGIN: + + asrs J, M, #2 // J = N / 4 + ble dgemm_tcopy_L2_BEGIN + +dgemm_tcopy_L4_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #2 // r3 = 4 * LDA + add r3, r3 , AO1 // A = A + 4 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #128 // B = B + 16 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L4_M4_40 + +dgemm_tcopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne dgemm_tcopy_L4_M4_20 + + +dgemm_tcopy_L4_M4_40: + + tst N , #2 + ble dgemm_tcopy_L4_M4_60 + + COPY2x4 + + +dgemm_tcopy_L4_M4_60: + + tst N, #1 + ble dgemm_tcopy_L4_M4_END + + COPY1x4 + + +dgemm_tcopy_L4_M4_END: + + subs J , J, #1 // j-- + bne dgemm_tcopy_L4_M4_BEGIN + + + +/*********************************************************************************************/ + +dgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble dgemm_tcopy_L999 + + tst M, #2 + ble dgemm_tcopy_L1_BEGIN + +dgemm_tcopy_L2_M4_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 8 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L2_M4_40 + +dgemm_tcopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne dgemm_tcopy_L2_M4_20 + + +dgemm_tcopy_L2_M4_40: + + tst N , #2 + ble dgemm_tcopy_L2_M4_60 + + COPY2x2 + +dgemm_tcopy_L2_M4_60: + + tst N , #1 + ble dgemm_tcopy_L2_M4_END + + COPY1x2 + + +dgemm_tcopy_L2_M4_END: + + +/*********************************************************************************************/ + +dgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble dgemm_tcopy_L999 + + +dgemm_tcopy_L1_M4_BEGIN: + 
+ ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 4 * SIZE + str r3, B + + asrs I, N, #2 // I = M / 4 + ble dgemm_tcopy_L1_M4_40 + +dgemm_tcopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne dgemm_tcopy_L1_M4_20 + + +dgemm_tcopy_L1_M4_40: + + tst N , #2 + ble dgemm_tcopy_L1_M4_60 + + COPY2x1 + +dgemm_tcopy_L1_M4_60: + + tst N , #1 + ble dgemm_tcopy_L1_M4_END + + COPY1x1 + + +dgemm_tcopy_L1_M4_END: + + + +dgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dot.c b/kernel/arm/dot.c new file mode 100644 index 000000000..30490e291 --- /dev/null +++ b/kernel/arm/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S new file mode 100644 index 000000000..762b9c580 --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -0,0 +1,1089 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d4 , [ BO ] + fldd d0 , [ AO ] + + fldd d1 , [ AO, #8 ] + pld [ AO , #A_PRE ] + + fmacd d8 , d0, d4 + fldd d2 , [ AO, #16 ] + fmacd d9 , d1, d4 + fldd d3 , [ AO, #24 ] + fmacd d10 , d2, d4 + fldd d5 , [ BO, #8 ] + fmacd d11 , d3, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + add AO , AO, #32 + fmacd d14 , d2, d5 + add BO , BO, #16 + 
fmacd d15 , d3, d5 + + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + fmuld d6 , d0 , d14 + fmuld d7 , d0 , d15 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + fstd d6 , [CO2, #16 ] + fstd d7 , [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL2x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + fmacd d12 , d0, d5 + fmacd d13 , d1, d5 + + add AO , AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + fmuld d4 , d0 , d12 + fmuld d5 , d0 , d13 + + fstd d4 , [CO2] + fstd d5 , [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d12, d8 + +.endm + +.macro KERNEL1x2_SUB + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + fmacd d12 , d0, d5 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + + fmuld d4 , d0 , d12 + + fstd d4 , [CO2] + + add CO1, CO1, #8 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9, d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d4 , [ BO 
] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + fmacd d10 , d2, d4 + fmacd d11 , d3, d4 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + fmuld d6 , d0 , d10 + fmuld d7 , d0 , d11 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + fstd d6 , [CO1, #16 ] + fstd d7 , [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL2x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d8 , d0, d4 + fmacd d9 , d1, d4 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE2x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + fmuld d5 , d0 , d9 + + fstd d4 , [CO1] + fstd d5 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + +.endm + +.macro KERNEL1x1_SUB + + fldd d4 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d8 , d0, d4 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + fldd d0, ALPHA + + fmuld d4 , d0 , d8 + + fstd d4 , [CO1] + + add CO1, CO1, #8 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif 
+ str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + 
+_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , 
L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + 
sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 
// L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..0f8a9291a --- /dev/null +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1953 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA d0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL4x4_I + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmuld d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmuld d17 , d1, d8 + fldd d9 , [ BO, #8 ] + 
fmuld d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmuld d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmuld d20 , d0, d9 + fmuld d21 , d1, d9 + add BO , BO, #32 + fmuld d22 , d2, d9 + + fldd d12, [ BO ] + fmuld d23 , d3, d9 + + fmuld d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmuld d25 , d1, d10 + fmuld d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmuld d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmuld d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmuld d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmuld d30 , d2, d11 + fmuld d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_S + pld [ BO , #B_PRE ] + + fldd d8 , [ BO ] + + pld [ AO , #A_PRE ] + fldmiad AO!, { d0 - d1} + + fmacd d16 , d0, d8 + fldmiad AO!, { d2 - d3} + fmacd d17 , d1, d8 + fldd d9 , [ BO, #8 ] + fmacd d18 , d2, d8 + fldd d10, [ BO, #16 ] + fmacd d19 , d3, d8 + + fldd d11, [ BO, #24 ] + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + add BO , BO, #32 + fmacd d22 , d2, d9 + + fldd d12, [ BO ] + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fldmiad AO!, { d4 - d5 } + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fldmiad AO!, { d6 - d7 } + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + + +.macro KERNEL4x4_M1 + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fldd d8 , [ BO ] + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fldmiad AO!, { d0 - d1 } + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fldmiad AO!, { d2 - d3 } + fmacd d25 , d5, d14 + fldd d9 , [ BO, #8 ] + fmacd d26 , d6, d14 + fldd d10, [ BO, #16 ] + fmacd d27 , d7, d14 + + fldd d11, [ BO, #24 ] + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + add BO , BO, #32 + fmacd d31 , d7, d15 + +.endm + +.macro KERNEL4x4_M2 + + + fmacd d16 , d0, d8 + pld [ AO , #A_PRE ] + fmacd d17 , d1, 
d8 + pld [ BO , #B_PRE ] + fmacd d18 , d2, d8 + fldd d12, [ BO ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fldmiad AO!, { d4 - d5 } + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fldmiad AO!, { d6 - d7 } + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fldd d13, [ BO, #8 ] + fmacd d28 , d0, d11 + fldd d14, [ BO, #16 ] + fmacd d29 , d1, d11 + fldd d15, [ BO, #24 ] + fmacd d30 , d2, d11 + fmacd d31 , d3, d11 + add BO , BO, #32 + +.endm + + +.macro KERNEL4x4_E + + fmacd d16 , d4, d12 + pld [ AO , #A_PRE ] + fmacd d17 , d5, d12 + fmacd d18 , d6, d12 + pld [ BO , #B_PRE ] + fmacd d19 , d7, d12 + + fmacd d20 , d4, d13 + fmacd d21 , d5, d13 + fmacd d22 , d6, d13 + fmacd d23 , d7, d13 + + fmacd d24 , d4, d14 + fmacd d25 , d5, d14 + fmacd d26 , d6, d14 + fmacd d27 , d7, d14 + + fmacd d28 , d4, d15 + fmacd d29 , d5, d15 + fmacd d30 , d6, d15 + fmacd d31 , d7, d15 + +.endm + + + + +.macro KERNEL4x4_SUB + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fldd d9 , [ BO, #8 ] + fmacd d17 , d1, d8 + fldd d10, [ BO, #16 ] + fmacd d18 , d2, d8 + fldd d11, [ BO, #24 ] + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + fmacd d26 , d2, d10 + fmacd d27 , d3, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #32 + fmacd d30 , d2, d11 + add BO , BO, #32 + fmacd d31 , d3, d11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA + add r4 , CO2, r3 + + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fmuld d12, d0 , d20 + fstd d8 , [CO1] + fmuld d13, d0 , d21 + fstd d9 , [CO1, #8 ] + fmuld d14, d0 , d22 + fstd d10, [CO1, #16 ] + fmuld d15, d0 , d23 + fstd d11, [CO1, #24 ] + + + fmuld d8 , d0 , d24 + fstd d12, 
[CO2] + fmuld d9 , d0 , d25 + fstd d13, [CO2, #8 ] + fmuld d10, d0 , d26 + fstd d14, [CO2, #16 ] + fmuld d11, d0 , d27 + fstd d15, [CO2, #24 ] + + add CO2, r4 , r3 + + fstd d8 , [r4 ] + fmuld d12, d0 , d28 + fstd d9 , [r4 , #8 ] + fmuld d13, d0 , d29 + fstd d10, [r4 , #16 ] + fmuld d14, d0 , d30 + fstd d11, [r4 , #24 ] + fmuld d15, d0 , d31 + + fstmiad CO2, { d12 - d15 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + + + +.macro KERNEL2x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + fmacd d24 , d0, d10 + fmacd d25 , d1, d10 + + fmacd d28 , d0, d11 + fmacd d29 , d1, d11 + add AO , AO, #16 + add BO , BO, #32 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + fmuld d8 , d0 , d24 + fmuld d9 , d0 , d25 + + fstd d8 , [r4 ] + fstd d9 , [r4 , #8 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fmuld d13, d0 , d29 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + vmov.f64 d24, d16 + vmov.f64 d28, d16 + +.endm + + + +.macro KERNEL1x4_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + fmacd d24 , d0, d10 + fmacd d28 , 
d0, d11 + + add AO , AO, #8 + add BO , BO, #32 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + fmuld d8 , d0 , d24 + fstd d8 , [r4 ] + + add CO2, r4 , r3 + + fmuld d12, d0 , d28 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + +.endm + + + +.macro KERNEL4x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + fmacd d22 , d2, d9 + fmacd d23 , d3, d9 + + add AO , AO, #32 + add BO , BO, #16 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + fmuld d14, d0 , d22 + fmuld d15, d0 , d23 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + fstd d14, [CO2, #16 ] + fstd d15, [CO2, #24 ] + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + +.endm + + + +.macro KERNEL2x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + fmacd d20 , d0, d9 + fmacd d21 , d1, d9 + + add AO 
, AO, #16 + add BO , BO, #16 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + fmuld d12, d0 , d20 + fmuld d13, d0 , d21 + + fstd d12, [CO2] + fstd d13, [CO2, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d20, d16 + +.endm + + + +.macro KERNEL1x2_SUB + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fldd d0 , [ AO ] + fmacd d16 , d0, d8 + fmacd d20 , d0, d9 + + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + fmuld d12, d0 , d20 + fstd d12, [CO2] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + +.endm + + + +.macro KERNEL4x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + fmacd d18 , d2, d8 + fmacd d19 , d3, d8 + + add AO , AO, #32 + add BO , BO, #8 + +.endm + +.macro SAVE4x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + fmuld d10, d0 , d18 + fmuld d11, d0 , d19 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + fstd d10, [CO1, #16 ] + fstd d11, [CO1, #24 ] + + add CO1, CO1, #32 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + +.endm + + + +.macro KERNEL2x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fmacd d16 , d0, d8 + fmacd d17 , d1, d8 + + add AO , AO, #16 + add BO , BO, 
#8 + +.endm + +.macro SAVE2x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fmuld d9 , d0 , d17 + + fstd d8 , [CO1] + fstd d9 , [CO1, #8 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + +.endm + + + +.macro KERNEL1x1_SUB + + fldd d8 , [ BO ] + + fldd d0 , [ AO ] + + fmacd d16 , d0, d8 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE1x1 + + + fldd d0, ALPHA + + fmuld d8 , d0 , d16 + fstd d8 , [CO1] + + add CO1, CO1, #8 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #3 // ldc = ldc * 8 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #5 // L = L / 32 + ble _L4_M4_40 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + +_L4_M4_22: + + KERNEL4x4_S + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_E + + subs L, L, #1 + ble _L4_M4_41 + + b _L4_M4_22 + + +_L4_M4_40: + + INIT4x4 + +_L4_M4_41: + + ands L , K1, #31 // L = L % 32 + ble _L4_M4_100 + +_L4_M4_42: + + KERNEL4x4_SUB + + subs L, L, #1 + bgt _L4_M4_42 + +_L4_M4_100: + + SAVE4x4 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + +_L4_M4_END: + + subs I, I, #1 + 
bgt _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + 
sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #4 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 4 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 4 * 8 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double 
values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + 
KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 2 double values + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , 
r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 2 * 8 + add r3, r3, r4 // B = B + K * 2 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 
// 1 double value + add BO , BO , r4 + lsls r4 , r3 , #5 // 4 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 1 double value + add BO , BO , r4 + lsls r4 , r3 , #4 // 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 1 
double value + add BO , BO , r4 + lsls r4 , r3 , #3 // 1 double value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/gemv_n.c b/kernel/arm/gemv_n.c new file mode 100644 index 000000000..aedcca965 --- /dev/null +++ b/kernel/arm/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n < 0 || inc_x < 1 ) return(max); + + maxf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) > ABS(maxf) ) + { + max = i; + maxf = ABS(x[ix]); + 
} + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S new file mode 100644 index 000000000..1d7344898 --- /dev/null +++ b/kernel/arm/iamax_vfp.S @@ -0,0 +1,478 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define INDEX r3 +#define Z r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(USE_ABS) + +#if defined(DOUBLE) + +#define VABS(x0,x1) vabs.f64 x0, x1 + +#else + +#define VABS(x0,x1) vabs.f32 x0, x1 + +#endif + +#else + +#define VABS(x0,x1) nop + +#endif + +/*****************************************************************************************/ + +#if defined(USE_MIN) + +#define MOVCOND movlt + +#if defined(DOUBLE) + +#define VMOVCOND vmovlt.f64 + +#else + +#define VMOVCOND vmovlt.f32 + +#endif + +#else + +#define MOVCOND movgt + +#if defined(DOUBLE) + +#define VMOVCOND vmovgt.f64 + +#else + +#define VMOVCOND vmovgt.f32 + +#endif + + +#endif + + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 } + VABS( d0, d0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmiad X, { d4 } + add Z, Z, #1 + VABS( d4, d4 ) + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm 
+ +#else + +.macro INIT_F + + fldmias X!, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 } + VABS( s0, s0 ) + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + add Z, Z, #1 + VABS( s4, s4 ) + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro INIT_F + + fldmiad X!, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmiad X, { d0 -d1 } + vabs.f64 d0, d0 + vabs.f64 d1, d1 + vadd.f64 d0 , d0, d1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + add Z, Z, #1 + vabs.f64 d4, d4 + vabs.f64 d5, d5 + vadd.f64 d4 , d4, d5 + vcmpe.f64 d4, d0 + vmrs APSR_nzcv, fpscr + VMOVCOND d0, d4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + +#else + +.macro INIT_F + + fldmias X!, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + +.endm + + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + +.endm + +.macro INIT_S + + fldmias X, { s0 -s1 } + vabs.f32 s0, s0 + vabs.f32 s1, s1 + vadd.f32 s0 , s0, s1 + mov Z, #1 + mov INDEX, Z + add X, X, INC_X + +.endm + + + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + add Z, Z, #1 + vabs.f32 s4, s4 + vabs.f32 s5, s5 + vadd.f32 s4 , s4, s5 + 
vcmpe.f32 s4, s0 + vmrs APSR_nzcv, fpscr + VMOVCOND s0, s4 + MOVCOND INDEX, Z + add X, X, INC_X + +.endm + + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4} + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 +#else + vsub.f32 s0 , s0 , s0 +#endif + mov INDEX, #0 + + cmp N, #0 + ble iamax_kernel_L999 + + cmp INC_X, #0 + beq iamax_kernel_L999 + + + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + + +iamax_kernel_F_BEGIN: + + INIT_F + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_F1 + + .align 5 + +iamax_kernel_F4: + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + ble iamax_kernel_F1 + + +#if defined(COMPLEX) || defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 +#if defined(COMPLEX) && defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F4 + +iamax_kernel_F1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + INIT_S + + subs N, N , #1 + ble iamax_kernel_L999 + + asrs I, N, #2 // I = N / 4 + ble iamax_kernel_S1 + + .align 5 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + 
bne iamax_kernel_S10 + + +iamax_kernel_L999: + + mov r0, INDEX // set return value + + pop {r4} + bx lr + + EPILOGUE + diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c new file mode 100644 index 000000000..fdb5d7a10 --- /dev/null +++ b/kernel/arm/iamin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=ABS(x[0]); + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c new file mode 100644 index 000000000..e3e4b9a6c --- /dev/null +++ b/kernel/arm/imax.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n < 0 || inc_x < 1 ) return(max); + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c new file mode 100644 index 000000000..fbcadc2fd --- /dev/null +++ b/kernel/arm/imin.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n < 0 || inc_x < 1 ) return(min); + + minf=x[0]; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c new file mode 100644 index 000000000..a6ba86388 --- /dev/null +++ b/kernel/arm/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf[2]; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(max); + + inc_x2 = 2 * inc_x; + + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) > CABS1(maxf,0) ) + { + max = i; + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c new file mode 100644 index 000000000..45c2a7c9c --- /dev/null +++ b/kernel/arm/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf[2]; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(min); + + inc_x2 = 2 * inc_x; + + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) < CABS1(minf,0) ) + { + min = i; + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/arm/max.c b/kernel/arm/max.c new file mode 100644 index 000000000..3239e3408 --- /dev/null +++ b/kernel/arm/max.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n < 0 || inc_x < 1 ) return(maxf); + + maxf=x[0]; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/arm/min.c b/kernel/arm/min.c new file mode 100644 index 000000000..de4c4719a --- /dev/null +++ b/kernel/arm/min.c @@ -0,0 +1,63 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n < 0 || inc_x < 1 ) return(minf); + + minf=x[0]; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c new file mode 100644 index 000000000..d65c5a410 --- /dev/null +++ b/kernel/arm/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n < 0 || inc_x < 1 ) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S new file mode 100644 index 000000000..4c62917b9 --- /dev/null +++ b/kernel/arm/nrm2_vfp.S @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/22 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + b nrm2_begin + + +#if defined(COMPLEX) + +#if defined(DOUBLE) + +znrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +cnrm2_one: + .word 0x3f800000 + +#endif + +#else + +#if defined(DOUBLE) + +dnrm2_one: + .word 0x00000000 + .word 0x3ff00000 + +#else + +snrm2_one: + .word 0x3f800000 + +#endif + +#endif + + + .align 5 + + +nrm2_begin: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , znrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , cnrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + +#else + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d1 , dnrm2_one // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s1 , snrm2_one // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + +#endif + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble 
nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S new file mode 100644 index 000000000..b56f8b038 --- /dev/null +++ b/kernel/arm/nrm2_vfpv3.S @@ -0,0 +1,508 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + + +.macro KERNEL_F1 + + fldmiad X!, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT: + + add X, X, INC_X + +.endm + + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F1 + + fldmiad X!, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + + vcmpe.f64 d4, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f64 d4, d4 + vcmpe.f64 d0, d4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d4, d0 // scale >= x ? 
x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f64 d2 , d0, d4 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f64 d5, d6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f64 d5, d5 + vcmpe.f64 d0, d5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale + vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f64 d2 , d0, d5 // scale / x + vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x ) + vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f64 d0 , d5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F1 + + fldmias X!, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_F1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_F1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_F1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_F1_END_\@: + + +.endm + +.macro KERNEL_F8 + + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + pld [ X, #X_PRE ] + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + + vcmpe.f32 s4, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_NEXT_\@ + vabs.f32 s4, s4 + vcmpe.f32 s0, s4 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_NEXT_\@ + vdiv.f32 s2 , s0, s4 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s4 // scale = x + +KERNEL_S1_NEXT_\@: + + vcmpe.f32 s5, s6 // compare with 0.0 + vmrs APSR_nzcv, fpscr + beq KERNEL_S1_END_\@ + vabs.f32 s5, s5 + vcmpe.f32 s0, s5 // compare with scale + vmrs APSR_nzcv, fpscr + vdivge.f32 s2 , s5, s0 // scale >= x ? 
x / scale + vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale ) + bge KERNEL_S1_END_\@ + vdiv.f32 s2 , s0, s5 // scale / x + vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x ) + vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x ) + vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x ) + vmov.f32 s0 , s5 // scale = x + +KERNEL_S1_END_\@: + + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + +#if defined(DOUBLE) + vsub.f64 d0 , d0 , d0 // scale=0.0 + vmov.f64 d1 , #1.0 // ssq=1.0 + vmov.f64 d7 , d1 // value 1.0 + vmov.f64 d6 , d0 // value 0.0 +#else + vsub.f32 s0 , s0 , s0 // scale=0.0 + vmov.f32 s1 , #1.0 // ssq=1.0 + vmov.f32 s7 , s1 // value 1.0 + vmov.f32 s6 , s0 // value 0.0 +#endif + + + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + + +nrm2_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + +nrm2_kernel_S1: + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + +#if defined(DOUBLE) + vsqrt.f64 d1, d1 + vmul.f64 d0, d0, d1 +#else + vsqrt.f32 s1, s1 + vmul.f32 s0, s0, s1 +#endif + + bx lr + + EPILOGUE + 
diff --git a/kernel/arm/rot.c b/kernel/arm/rot.c new file mode 100644 index 000000000..aa60b4471 --- /dev/null +++ b/kernel/arm/rot.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S new file mode 100644 index 000000000..663ecdf81 --- /dev/null +++ b/kernel/arm/rot_vfp.S @@ -0,0 +1,584 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_Y [fp, #0 ] + + +#define N r0 +#define X r1 +#define INC_X r2 +#define Y r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad 
X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + fldmiad Y, { d5 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d5 + vmul.f64 d3 , d0, d5 + fnmacd d3 , d1, d4 + fstmiad X, { d2 } + fstmiad Y, { d3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s5 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s5 + vmul.f32 s3 , s0, s5 + fnmacs s3 , s1, s4 + fstmias X, { s2 } + fstmias Y, { s3 } + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + 
fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + fstmiad X!, { d2 } + fstmiad Y!, { d3 } + + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + fldmiad Y, { d6 - d7 } + vmul.f64 d2 , d0, d4 + fmacd d2 , d1, d6 + vmul.f64 d3 , d0, d6 + fnmacd d3 , d1, d4 + vstr d2 , [ X, #0 ] + vstr d3 , [ Y, #0 ] + vmul.f64 d2 , d0, d5 + fmacd d2 , d1, d7 + vmul.f64 d3 , d0, d7 + fnmacd d3 , d1, d5 + vstr d2 , [ X, #8 ] + vstr d3 , [ Y, #8 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + 
fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + fstmias X!, { s2 } + fstmias Y!, { s3 } + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + fstmias X!, { s2 } + fstmias Y!, { s3 } + + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + fldmias Y, { s6 - s7 } + vmul.f32 s2 , s0, s4 + fmacs s2 , s1, s6 + vmul.f32 s3 , s0, s6 + fnmacs s3 , s1, s4 + vstr s2 , [ X, #0 ] + vstr s3 , [ Y, #0 ] + vmul.f32 s2 , s0, s5 + fmacs s2 , s1, s7 + vmul.f32 s3 , s0, s7 + fnmacs s3 , s1, s5 + vstr s2 , [ X, #4 ] + vstr s3 , [ Y, #4 ] + + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + 
+#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble rot_kernel_L999 + + cmp INC_X, #0 + beq rot_kernel_L999 + + cmp INC_Y, #0 + beq rot_kernel_L999 + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + + +rot_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_F1 + + .align 5 + +rot_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble rot_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + b rot_kernel_L999 + +rot_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble rot_kernel_S1 + + .align 5 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + + +rot_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/scal.c b/kernel/arm/scal.c new file mode 100644 index 000000000..d385c46bc --- /dev/null +++ b/kernel/arm/scal.c @@ -0,0 +1,58 
@@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/09/14 Saar
+* BLASTEST float : OK
+* BLASTEST double : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+#include "common.h"
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) /* generic SCAL kernel: x[i*inc_x] *= da for i in [0,n); only n, da, x, inc_x are used — y/inc_y/dummy* exist to satisfy the shared kernel signature */
+{
+	BLASLONG i=0;
+
+	if ( n < 0 || inc_x < 1 ) return(0); /* empty/negative length or non-positive stride: nothing to do (n==0 falls through but the loop below never runs) */
+	if ( da == 1.0 ) return(0); /* scaling by exactly 1.0 leaves x unchanged, so skip the pass over memory */
+
+	n *= inc_x; /* n now holds the one-past-last flat index into x */
+	while(i < n)
+	{
+
+		x[i] = da * x[i] ;
+		i += inc_x ; /* step by the stride, touching every inc_x-th element */
+
+	}
+	return(0); /* kernel convention: always return 0 */
+
+}
+
+
diff --git a/kernel/arm/scal_vfp.S b/kernel/arm/scal_vfp.S
new file mode 100644
index 000000000..a04b7241e
--- /dev/null
+++ b/kernel/arm/scal_vfp.S
@@ -0,0 +1,376 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/15 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [sp, #0 ] + + +#define N r0 +#define INC_X r1 +#define X r3 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + fldmiad X, { d4 - d7 } + vmul.f64 d4, d4, d0 + vmul.f64 d5, d5, d0 + vmul.f64 d6, d6, d0 + fstmiad X!, { d4 - d5 } + vmul.f64 d7, d7, d0 + fstmiad X!, { d6 - d7 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 } + vmul.f64 d4, d4, d0 + fstmiad X, { d4 } + add X, X, INC_X + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s4 - s7 } + vmul.f32 s4, s4, s0 + vmul.f32 s5, s5, s0 + vmul.f32 s6, s6, s0 + fstmias X!, { s4 - s5 } + vmul.f32 s7, s7, s0 + fstmias X!, { s6 - s7 } 
+ +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s4 } + vmul.f32 s4, s4, s0 + fstmias X, { s4 } + add X, X, INC_X + +.endm + + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + pld [ X, #X_PRE ] + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X!, { d2 - d3 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d4 - d5 } + vmul.f64 d2, d0, d4 + fnmacd d2, d1, d5 + vmul.f64 d3, d0, d5 + fmacd d3, d1, d4 + fstmiad X, { d2 - d3 } + add X, X, INC_X + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X!, { s2 - s3 } + 
+.endm + +.macro KERNEL_S1 + + fldmias X, { s4 - s5 } + vmul.f32 s2, s0, s4 + fnmacs s2, s1, s5 + vmul.f32 s3, s0, s5 + fmacs s3, s1, s4 + fstmias X, { s2 - s3 } + add X, X, INC_X + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr INC_X , OLD_INC_X + + cmp N, #0 + ble scal_kernel_L999 + + cmp INC_X, #0 + ble scal_kernel_L999 + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + + +scal_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_F1 + + .align 5 + +scal_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble scal_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne scal_kernel_F4 + +scal_kernel_F1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + b scal_kernel_L999 + +scal_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble scal_kernel_S1 + + .align 5 + +scal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + + +scal_kernel_L999: + + mov r0, #0 // set return value + + bx lr + + EPILOGUE + diff --git a/kernel/arm/scopy_vfp.S b/kernel/arm/scopy_vfp.S new file mode 100644 index 000000000..e6ceaf2fb --- /dev/null +++ b/kernel/arm/scopy_vfp.S @@ -0,0 +1,224 @@ +/*************************************************************************** 
+Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F8 + + pld [ X, #X_PRE ] + fldmias X!, { s0 - s3 } + fldmias X!, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias Y!, { s4 - s7 } + +.endm + +.macro COPY_F1 + + fldmias X!, { s0 } + fstmias Y!, { s0 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s1 } + fstmias Y, { s1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmias X, { s0 } + fstmias Y, { s0 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + 
PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble scopy_kernel_L999 + + cmp INC_X, #0 + beq scopy_kernel_L999 + + cmp INC_Y, #0 + beq scopy_kernel_L999 + + cmp INC_X, #1 + bne scopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne scopy_kernel_S_BEGIN + +scopy_kernel_F_BEGIN: + + asrs I, N, #3 // I = N / 8 + ble scopy_kernel_F1 + +scopy_kernel_F8: + + COPY_F8 + + subs I, I, #1 + bne scopy_kernel_F8 + +scopy_kernel_F1: + + ands I, N, #7 + ble scopy_kernel_L999 + +scopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne scopy_kernel_F10 + + b scopy_kernel_L999 + +scopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble scopy_kernel_S1 + +scopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne scopy_kernel_S4 + +scopy_kernel_S1: + + ands I, N, #3 + ble scopy_kernel_L999 + +scopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne scopy_kernel_S10 + + + + + + +scopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S new file mode 100644 index 000000000..2d1909201 --- /dev/null +++ b/kernel/arm/sdot_vfp.S @@ -0,0 +1,347 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK (no test for dsdot) +* TEST : OK (no test for dsdot) +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +#if defined(DSDOT) + +.macro KERNEL_F4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s14 } + fldmias Y!, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + +.endm + + +.macro KERNEL_S4 + + nop + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add 
Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s14 } + fldmias Y, { s15 } + vmul.f32 s15, s14, s15 + vcvt.f64.f32 d4, s15 + vadd.f64 d0 , d0, d4 + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#else + +.macro KERNEL_F4 + + fldmias X!, { s8 - s9 } + fldmias Y!, { s4 - s5} + fmacs s0 , s4, s8 + fldmias X!, { s10 - s11 } + fmacs s1 , s5, s9 + fldmias Y!, { s6 - s7 } + fmacs s0 , s6, s10 + fmacs s1 , s7, s11 + +.endm + +.macro KERNEL_F1 + + fldmias X!, { s4 } + fldmias Y!, { s8 } + fmacs s0 , s4, s8 + +.endm + + +.macro KERNEL_S4 + + nop + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s4, s8 + + fldmias X, { s5 } + fldmias Y, { s9 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s5, s9 + + fldmias X, { s6 } + fldmias Y, { s10 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s0 , s6, s10 + + fldmias X, { s7 } + fldmias Y, { s11 } + add X, X, INC_X + add Y, Y, INC_Y + fmacs s1 , s7, s11 + +.endm + + +.macro KERNEL_S1 + + fldmias X, { s4 } + fldmias Y, { s8 } + add X, X, INC_X + fmacs s0 , s4, s8 + add Y, Y, INC_Y + +.endm + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { s8 - s15 } // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + +#if defined(DSDOT) + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + +#else + + vsub.f32 s0 , s0 , s0 + vsub.f32 s1 , s1 , s1 + +#endif + + cmp N, #0 + ble 
sdot_kernel_L999 + + cmp INC_X, #0 + beq sdot_kernel_L999 + + cmp INC_Y, #0 + beq sdot_kernel_L999 + + cmp INC_X, #1 + bne sdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne sdot_kernel_S_BEGIN + +sdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_F1 + +sdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne sdot_kernel_F4 + +sdot_kernel_F1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne sdot_kernel_F10 + + b sdot_kernel_L999 + +sdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE + + asrs I, N, #2 // I = N / 4 + ble sdot_kernel_S1 + +sdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne sdot_kernel_S4 + +sdot_kernel_S1: + + ands I, N, #3 + ble sdot_kernel_L999 + +sdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne sdot_kernel_S10 + + + + + + +sdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + +#if defined(DSDOT) + + vadd.f64 d0 , d0, d1 // set return value + +#else + + vadd.f32 s0 , s0, s1 // set return value + +#endif + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S new file mode 100644 index 000000000..0e2061d77 --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -0,0 +1,797 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + 
+#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldmias AO! , { s0 - s3 } + fldmias BO! , { s4 - s5 } + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + flds s6 , [CO2, #8 ] + flds s7 , [CO2, #12 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + fmacs s6 , s0 , s14 + fmacs s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + + fsts s4 , 
[CO1] + fsts s5 , [CO1, #4 ] + + flds s4 , [CO2] + flds s5 , [CO2, #4 ] + + fmacs s4 , s0 , s12 + fmacs s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + +.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + flds s4 , [CO2] + + fmacs s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + flds s6 , [CO1, #8 ] + flds s7 , [CO1, #12 ] + + fmacs s4 , s0 , s8 + fmacs s5 , s0 , s9 + fmacs s6 , s0 , s10 + fmacs s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + flds s4 , [CO1] + flds s5 , [CO1, #4 ] + + fmacs s4 , s0 , 
s8 + fmacs s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + flds s4 , [CO1] + + fmacs s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble sgemm_kernel_L1_BEGIN + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO, #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO, #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + 
ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L2_BEGIN + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble sgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, 
CO1 + str r3 , C // store C + + ldr AO, A // AO = A + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + 
KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..38dc4d3ea --- /dev/null +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -0,0 +1,1436 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/11/02 Saar +* UNROLL_N 4 +* UNROLL_M 4 +* DGEMM_P 128 +* DGEMM_Q 240 +* DGEMM_R 12288 +* A_PRE 128 +* B_PRE 128 +* C_PRE 32 +* +* Performance on Odroid U2: +* +* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS +* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS +* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS +* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 128 
+#define B_PRE 128 +#define C_PRE 32 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + pld [ AO , #A_PRE ] + fldmias AO!, { s0 - s1 } + pld [ BO , #B_PRE ] + fldmias BO!, { s8 - s9 } + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s3 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias BO!, { s8 - s11 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + //fldmias AO!, { s2 - s3 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + //fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s7 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias BO!, { s12 - s15 } + //fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + 
fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + //fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + +.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + fldmias CO1, { s8 - s11 } + + fmacs s8 , s0 , s16 + flds s12, [CO2] + fmacs s9 , s0 , s17 + flds s13, [CO2, #4 ] + fmacs s10, s0 , s18 + flds s14, [CO2, #8 ] + fmacs s11, s0 , s19 + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fsts s8 , [CO1] + fmacs s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmacs s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmacs s15, s0 , s23 + fsts s11, [CO1, #12 ] + + pld [ CO1 , #C_PRE ] + + fldmias r4, { s8 - s11 } + + fmacs s8 , s0 , s24 + fsts s12, [CO2] + fmacs s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmacs s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmacs s11, s0 , s27 + fsts 
s15, [CO2, #12 ] + + pld [ CO2 , #C_PRE ] + + add CO2, r4 , r3 + + + fldmias CO2, { s12 - s15 } + + fsts s8 , [r4 ] + fmacs s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmacs s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmacs s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmacs s15, s0 , s31 + + pld [ r4 , #C_PRE ] + fstmias CO2, { s12 - s15 } + pld [ CO2 , #C_PRE ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + flds s8 , [r4 ] + flds s9 , [r4 , #4 ] + + fmacs s8 , s0 , s24 + fmacs s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s28 + fmacs s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + 
flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + flds s8 , [r4 ] + fmacs s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + flds s12, [CO2] + fmacs s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + flds s14, [CO2, #8 ] + flds s15, [CO2, #12 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + fmacs s14, s0 , s22 + fmacs s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + 
+/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + flds s12, [CO2] + flds s13, [CO2, #4 ] + + fmacs s12, s0 , s20 + fmacs s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + flds s12, [CO2] + fmacs s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + 
flds s9 , [CO1, #4 ] + flds s10, [CO1, #8 ] + flds s11, [CO1, #12 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + fmacs s10, s0 , s18 + fmacs s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + flds s9 , [CO1, #4 ] + + fmacs s8 , s0 , s16 + fmacs s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + flds s8 , [CO1] + fmacs s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble sgemm_kernel_L2_BEGIN + +sgemm_kernel_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , 
r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +sgemm_kernel_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + + mov BO, BC + asrs L , K1, #1 // L = L / 8 + cmp L , #2 + blt sgemm_kernel_L4_M4_32 + + + + KERNEL4x4_I + KERNEL4x4_M2 + + subs L, L, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs L, L, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst L, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + + +sgemm_kernel_L4_M4_40: + + INIT4x4 + + +sgemm_kernel_L4_M4_44: + + ands L , K1, #1 // L = L % 8 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne sgemm_kernel_L4_M4_46 + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + + subs I, I, #1 + bne sgemm_kernel_L4_M4_20 + + +sgemm_kernel_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L4_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + 
KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + + +sgemm_kernel_L4_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + + subs J , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + + +/*********************************************************************************************/ + +sgemm_kernel_L2_BEGIN: + + ldr J , N + tst J , #3 + ble sgemm_kernel_L999 + + tst J , #2 + ble sgemm_kernel_L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +sgemm_kernel_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L2_M4_20 + + +sgemm_kernel_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L2_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M2_40 + 
+sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + + +sgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +/*********************************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble sgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +sgemm_kernel_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_22 + + 
+sgemm_kernel_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + + subs I, I, #1 + bgt sgemm_kernel_L1_M4_20 + + +sgemm_kernel_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble sgemm_kernel_L1_END + + tst I, #2 // I = I / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + + +sgemm_kernel_L1_END: + + +sgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/sgemm_ncopy_2_vfp.S b/kernel/arm/sgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..0546f1d69 --- /dev/null +++ 
b/kernel/arm/sgemm_ncopy_2_vfp.S @@ -0,0 +1,225 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/24 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+// sgemm_ncopy_2_vfp: pack an M x N single-precision matrix A (leading
+// dimension lda) into the contiguous buffer B, two columns at a time,
+// walking down rows within each column pair.
+// Arguments (see defines below): r0 = M, r1 = N, r2 = A, r3 = lda,
+// [fp, #4] = B. Returns 0 in r0.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+
+#define B [fp, #4 ]
+
+#define M r0
+#define N r1
+#define A r2
+
+#define BO r5
+
+#define AO1 r6
+#define AO2 r7
+#define LDA r8
+
+#define I r3
+#define J r12
+
+#define A_PRE 256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Copy a 2x2 tile: two consecutive rows from the two column pointers
+// AO1/AO2; stored to B interleaved as (r0c0, r0c1, r1c0, r1c1).
+.macro COPY2x2
+
+ flds s0 , [ AO1, #0 ]
+ flds s2 , [ AO1, #4 ]
+
+ flds s1 , [ AO2, #0 ]
+ flds s3 , [ AO2, #4 ]
+
+ add AO1, AO1, #8
+ fstmias BO!, { s0 - s3 }
+ add AO2, AO2, #8
+
+.endm
+
+
+// Copy one row across the two columns AO1/AO2 (M odd remainder).
+.macro COPY1x2
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO2, #0 ]
+ add AO1, AO1, #4
+
+ fstmias BO!, { s0 - s1 }
+ add AO2, AO2, #4
+
+.endm
+
+// Copy two consecutive rows of the single remaining column AO1 (N odd).
+.macro COPY2x1
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO1, #4 ]
+
+ fstmias BO!, { s0 - s1 }
+ add AO1, AO1, #8
+
+.endm
+
+
+// Copy a single element of the single remaining column AO1.
+.macro COPY1x1
+
+ flds s0 , [ AO1, #0 ]
+
+ fstmias BO!, { s0 }
+ add AO1, AO1, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ push {r4 - r9, fp}
+ add fp, sp, #24
+
+ lsl LDA, OLD_LDA, #2 // lda = lda * 4
+
+ ldr BO, B
+
+
+/*********************************************************************************************/
+
+sgemm_ncopy_L2_BEGIN:
+
+ asrs J, N, #1 // J = N / 2
+ ble sgemm_ncopy_L1_BEGIN // fewer than 2 columns in total
+
+sgemm_ncopy_L2_M2_BEGIN:
+
+ mov AO1, A // AO1 = A
+ add AO2, AO1, LDA
+ add A , AO2, LDA // A = A + 2 * LDA
+
+ asrs I, M, #1 // I = M / 2
+ ble sgemm_ncopy_L2_M2_40
+
+sgemm_ncopy_L2_M2_20:
+
+ COPY2x2
+
+ subs I , I , #1
+ bne sgemm_ncopy_L2_M2_20
+
+
+sgemm_ncopy_L2_M2_40:
+
+ ands I, M , #1 // odd trailing row?
+ ble sgemm_ncopy_L2_M2_END
+
+sgemm_ncopy_L2_M2_60:
+
+ COPY1x2
+
+ subs I , I , #1
+ bne sgemm_ncopy_L2_M2_60
+
+
+sgemm_ncopy_L2_M2_END:
+
+ subs J , J, #1 // j--
+ bne sgemm_ncopy_L2_M2_BEGIN
+
+/*********************************************************************************************/
+
+// Handle the last column when N is odd.
+sgemm_ncopy_L1_BEGIN:
+
+ tst N, #1
+ ble sgemm_ncopy_L999
+
+
+sgemm_ncopy_L1_M2_BEGIN:
+
+ mov AO1, A // AO1 = A
+ add A , AO1, LDA // A = A + 1 * LDA
+
+ asrs I, M, #1 // I = M / 2
+ ble sgemm_ncopy_L1_M2_40
+
+sgemm_ncopy_L1_M2_20:
+
+ COPY2x1
+
+ subs I , I , #1
+ bne sgemm_ncopy_L1_M2_20
+
+
+sgemm_ncopy_L1_M2_40:
+
+ ands I, M , #1
+ ble sgemm_ncopy_L1_M2_END
+
+sgemm_ncopy_L1_M2_60:
+
+ COPY1x1
+
+ subs I , I , #1
+ bne sgemm_ncopy_L1_M2_60
+
+
+sgemm_ncopy_L1_M2_END:
+
+
+
+sgemm_ncopy_L999:
+
+
+ movs r0, #0 // set return value
+ sub sp, fp, #24
+ pop {r4 - r9, fp}
+ bx lr
+
+ EPILOGUE
+
diff --git a/kernel/arm/sgemm_ncopy_4_vfp.S b/kernel/arm/sgemm_ncopy_4_vfp.S
new file mode 100644
index 000000000..2d8fa2e24
--- /dev/null
+++ b/kernel/arm/sgemm_ncopy_4_vfp.S
@@ -0,0 +1,353 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3.
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/05 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+// sgemm_ncopy_4_vfp: pack an M x N single-precision matrix A (leading
+// dimension lda) into the contiguous buffer B, four columns at a time,
+// with 2- and 1-column tail handling.
+// Arguments: r0 = M, r1 = N, r2 = A, r3 = lda, [fp, #4] = B. Returns 0.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define LDA [fp, #-260 ]
+
+#define B [fp, #4 ]
+
+#define M r0
+#define N r1
+#define A r2
+
+#define BO r5
+
+#define AO1 r6
+#define AO2 r7
+#define AO3 r8
+#define AO4 r9
+
+#define I r3
+#define J r12
+
+#define A_PRE 192
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Copy a 4x4 tile from the four column pointers AO1..AO4; B receives the
+// 16 values row-interleaved (s0-s3 = row 0 of cols 0-3, and so on).
+// Pointer bumps are interleaved with the loads to hide latency.
+.macro COPY4x4
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO2, #0 ]
+ flds s2 , [ AO3, #0 ]
+ flds s3 , [ AO4, #0 ]
+
+ flds s4 , [ AO1, #4 ]
+ flds s8 , [ AO1, #8 ]
+ flds s12, [ AO1, #12 ]
+
+ flds s5 , [ AO2, #4 ]
+ add AO1, AO1, #16
+ flds s9 , [ AO2, #8 ]
+ flds s13, [ AO2, #12 ]
+
+ flds s6 , [ AO3, #4 ]
+ add AO2, AO2, #16
+ flds s10, [ AO3, #8 ]
+ flds s14, [ AO3, #12 ]
+
+ flds s7 , [ AO4, #4 ]
+ add AO3, AO3, #16
+ flds s11, [ AO4, #8 ]
+ flds s15, [ AO4, #12 ]
+
+ fstmias BO!, { s0 - s3 }
+ add AO4, AO4, #16
+ fstmias BO!, { s4 - s7 }
+ fstmias BO!, { s8 - s15 }
+
+.endm
+
+// Copy one row across the four columns AO1..AO4 (M remainder rows).
+.macro COPY1x4
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO2, #0 ]
+ add AO1, AO1, #4
+ flds s2 , [ AO3, #0 ]
+ add AO2, AO2, #4
+ flds s3 , [ AO4, #0 ]
+
+ add AO3, AO3, #4
+ fstmias BO!, { s0 - s3 }
+ add AO4, AO4, #4
+
+.endm
+
+// Copy a 4x2 tile (four rows of the two columns AO1/AO2), row-interleaved.
+.macro COPY4x2
+
+ flds s0 , [ AO1, #0 ]
+ flds s2 , [ AO1, #4 ]
+ flds s4 , [ AO1, #8 ]
+ flds s6 , [ AO1, #12 ]
+
+ flds s1 , [ AO2, #0 ]
+ flds s3 , [ AO2, #4 ]
+ add AO1, AO1, #16
+ flds s5 , [ AO2, #8 ]
+ flds s7 , [ AO2, #12 ]
+
+ fstmias BO!, { s0 - s7 }
+ add AO2, AO2, #16
+
+.endm
+
+
+// Copy one row across the two columns AO1/AO2.
+.macro COPY1x2
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO2, #0 ]
+ add AO1, AO1, #4
+
+ fstmias BO!, { s0 - s1 }
+ add AO2, AO2, #4
+
+.endm
+
+// Copy four consecutive rows of the single remaining column AO1.
+.macro COPY4x1
+
+ flds s0 , [ AO1, #0 ]
+ flds s1 , [ AO1, #4 ]
+ flds s2 , [ AO1, #8 ]
+ flds s3 , [ AO1, #12 ]
+
+ fstmias BO!, { s0 - s3 }
+ add AO1, AO1, #16
+
+.endm
+
+
+// Copy a single element of the single remaining column AO1.
+.macro COPY1x1
+
+ flds s0 , [ AO1, #0 ]
+
+ fstmias BO!, { s0 }
+ add AO1, AO1, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ push {r4 - r9, fp}
+ add fp, sp, #24
+ sub sp, sp, #STACKSIZE // reserve stack
+
+
+ lsl r3, r3, #2 // lda = lda * 4
+ str r3, LDA
+
+ sub r4, fp, #128
+ vstm r4, { s8 - s15} // store floating point registers
+
+ ldr BO, B
+
+sgemm_ncopy_L4_BEGIN:
+
+ asrs J, N, #2 // J = N / 4
+ ble sgemm_ncopy_L2_BEGIN
+
+sgemm_ncopy_L4_M4_BEGIN:
+
+ mov AO1, A // AO1 = A
+ ldr r4 , LDA
+ add AO2, AO1, r4
+ add AO3, AO2, r4
+ add AO4, AO3, r4
+ add A , AO4, r4 // A = A + 4 * LDA
+
+ asrs I, M, #2 // I = M / 4
+ ble sgemm_ncopy_L4_M4_40
+
+// Main loop: prefetch each column every second iteration only.
+sgemm_ncopy_L4_M4_20:
+
+ pld [ AO1, #A_PRE ]
+ pld [ AO2, #A_PRE ]
+ pld [ AO3, #A_PRE ]
+ pld [ AO4, #A_PRE ]
+ COPY4x4
+
+ subs I , I , #1
+ ble sgemm_ncopy_L4_M4_40
+
+ COPY4x4
+
+ subs I , I , #1
+ bne sgemm_ncopy_L4_M4_20
+
+
+sgemm_ncopy_L4_M4_40:
+
+ ands I, M , #3 // trailing rows (M % 4)
+ ble sgemm_ncopy_L4_M4_END
+
+sgemm_ncopy_L4_M4_60:
+
+ COPY1x4
+
+ subs I , I , #1
+ bne sgemm_ncopy_L4_M4_60
+
+
+sgemm_ncopy_L4_M4_END:
+
+ subs J , J, #1 // j--
+ bne sgemm_ncopy_L4_M4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+// Handle two leftover columns when N % 4 >= 2.
+sgemm_ncopy_L2_BEGIN:
+
+ tst N, #3
+ ble sgemm_ncopy_L999
+
+ tst N, #2
+ ble sgemm_ncopy_L1_BEGIN
+
+sgemm_ncopy_L2_M4_BEGIN:
+
+ mov AO1, A // AO1 = A
+ ldr r4 , LDA
+ add AO2, AO1, r4
+ add A , AO2, r4 // A = A + 2 * LDA
+
+ asrs I, M, #2 // I = M / 4
+ ble sgemm_ncopy_L2_M4_40
+
+sgemm_ncopy_L2_M4_20:
+
+ COPY4x2
+
+ subs I , I , #1
+ bne sgemm_ncopy_L2_M4_20
+
+
+sgemm_ncopy_L2_M4_40:
+
+ ands I, M , #3
+ ble sgemm_ncopy_L2_M4_END
+
+sgemm_ncopy_L2_M4_60:
+
+ COPY1x2
+
+ subs I , I , #1
+ bne sgemm_ncopy_L2_M4_60
+
+
+sgemm_ncopy_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+// Handle the last column when N is odd.
+sgemm_ncopy_L1_BEGIN:
+
+ tst N, #1
+ ble sgemm_ncopy_L999
+
+
+sgemm_ncopy_L1_M4_BEGIN:
+
+ mov AO1, A // AO1 = A
+ ldr r4 , LDA
+ add A , AO1, r4 // A = A + 1 * LDA
+
+ asrs I, M, #2 // I = M / 4
+ ble sgemm_ncopy_L1_M4_40
+
+sgemm_ncopy_L1_M4_20:
+
+ COPY4x1
+
+ subs I , I , #1
+ bne sgemm_ncopy_L1_M4_20
+
+
+sgemm_ncopy_L1_M4_40:
+
+ ands I, M , #3
+ ble sgemm_ncopy_L1_M4_END
+
+sgemm_ncopy_L1_M4_60:
+
+ COPY1x1
+
+ subs I , I , #1
+ bne sgemm_ncopy_L1_M4_60
+
+
+sgemm_ncopy_L1_M4_END:
+
+
+
+sgemm_ncopy_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s15} // restore floating point registers
+
+ movs r0, #0 // set return value
+ sub sp, fp, #24
+ pop {r4 - r9, fp}
+ bx lr
+
+ EPILOGUE
+
diff --git a/kernel/arm/sgemm_tcopy_4_vfp.S b/kernel/arm/sgemm_tcopy_4_vfp.S
new file mode 100644
index 000000000..b0a3278ff
--- /dev/null
+++ b/kernel/arm/sgemm_tcopy_4_vfp.S
@@ -0,0 +1,430 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/06 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+**************************************************************************************/
+
+// sgemm_tcopy_4_vfp: pack an M x N single-precision matrix A (leading
+// dimension lda) transposed into buffer B. Rows of A are copied in
+// 4-wide chunks into three regions of B (BO1 for 4-wide, BO2 for 2-wide,
+// BO3 for 1-wide column remainders).
+// Arguments: r0 = M, r1 = N, r2 = A, r3 = lda, [fp, #4] = B. Returns 0.
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACKSIZE 256
+
+#define OLD_M r0
+#define OLD_N r1
+#define OLD_A r2
+#define OLD_LDA r3
+
+
+/******************************************************
+* [fp, #-128] - [fp, #-64] is reserved
+* for store and restore of floating point
+* registers
+*******************************************************/
+
+#define B [fp, #4 ]
+#define A [fp, #-248 ]
+
+#define M r0
+#define N r1
+#define M4 r2
+
+#define LDA r5
+
+#define AO1 r6
+#define BO1 r7
+#define BO2 r8
+#define BO3 r9
+
+#define I r4
+#define J r12
+
+#define A_PRE 256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+// Copy a 4x4 tile (four rows starting at AO1, row stride LDA) into the
+// 4-wide panel at BO1, prefetching the next cache line of each row.
+.macro COPY4x4_1
+
+ pld [ AO1, #A_PRE ]
+ fldmias AO1, { s0 - s3 }
+
+ add r3, AO1, LDA
+ pld [ r3, #A_PRE ]
+ fldmias r3, { s4 - s7 }
+
+ add r3, r3, LDA
+ pld [ r3, #A_PRE ]
+ fldmias r3, { s8 - s11 }
+
+ add r3, r3, LDA
+ pld [ r3, #A_PRE ]
+ fldmias r3, { s12 - s15 }
+
+ fstmias BO1, { s0 - s15 }
+ add AO1, AO1, #16
+ add BO1, BO1, M4
+
+.endm
+
+// Same as COPY4x4_1 but without prefetches (used every second iteration).
+.macro COPY4x4_2
+
+ fldmias AO1, { s0 - s3 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s4 - s7 }
+
+ add r3, r3, LDA
+ fldmias r3, { s8 - s11 }
+
+ add r3, r3, LDA
+ fldmias r3, { s12 - s15 }
+
+ fstmias BO1, { s0 - s15 }
+ add AO1, AO1, #16
+ add BO1, BO1, M4
+
+.endm
+
+
+// Copy a 2x4 tile (2 columns of four rows) into the 2-wide panel at BO2.
+.macro COPY2x4
+
+ fldmias AO1, { s0 - s1 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s2 - s3 }
+
+ add r3, r3, LDA
+ fldmias r3, { s4 - s5 }
+
+ add r3, r3, LDA
+ fldmias r3, { s6 - s7 }
+
+ fstmias BO2, { s0 - s7 }
+ add AO1, AO1, #8
+ add BO2, BO2, #32
+
+.endm
+
+// Copy a 1x4 tile (last column of four rows) into the 1-wide panel at BO3.
+.macro COPY1x4
+
+ fldmias AO1, { s0 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s1 }
+
+ add r3, r3, LDA
+ fldmias r3, { s2 }
+
+ add r3, r3, LDA
+ fldmias r3, { s3 }
+
+ fstmias BO3, { s0 - s3 }
+ add AO1, AO1, #4
+ add BO3, BO3, #16
+
+.endm
+
+/*************************************************************************************************************************/
+
+// 2-row variants (M % 4 >= 2).
+.macro COPY4x2
+
+ fldmias AO1, { s0 - s3 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s4 - s7 }
+
+ fstmias BO1, { s0 - s7 }
+ add AO1, AO1, #16
+ add BO1, BO1, M4
+
+.endm
+
+.macro COPY2x2
+
+ fldmias AO1, { s0 - s1 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s2 - s3 }
+
+ fstmias BO2, { s0 - s3 }
+ add AO1, AO1, #8
+ add BO2, BO2, #16
+
+.endm
+
+.macro COPY1x2
+
+ fldmias AO1, { s0 }
+
+ add r3, AO1, LDA
+ fldmias r3, { s1 }
+
+ fstmias BO3, { s0 - s1 }
+ add AO1, AO1, #4
+ add BO3, BO3, #8
+
+.endm
+
+/*************************************************************************************************************************/
+
+// 1-row variants (M odd).
+.macro COPY4x1
+
+ fldmias AO1, { s0 - s3 }
+
+ fstmias BO1, { s0 - s3 }
+ add AO1, AO1, #16
+ add BO1, BO1, M4
+
+.endm
+
+.macro COPY2x1
+
+ fldmias AO1, { s0 - s1 }
+
+ fstmias BO2, { s0 - s1 }
+ add AO1, AO1, #8
+ add BO2, BO2, #8
+
+.endm
+
+.macro COPY1x1
+
+ fldmias AO1, { s0 }
+
+ fstmias BO3, { s0 }
+ add AO1, AO1, #4
+ add BO3, BO3, #4
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+ PROLOGUE
+
+ .align 5
+
+ push {r4 - r9, fp}
+ add fp, sp, #24
+ sub sp, sp, #STACKSIZE // reserve stack
+
+ str OLD_A, A // store A
+
+ lsl LDA, OLD_LDA, #2 // lda = lda * SIZE
+
+ sub r4, fp, #128
+ vstm r4, { s8 - s15} // store floating point registers
+
+ lsl r4 , M, #2 // M * SIZE
+
+ ldr r3, B
+
+ and BO2 , N , #-4 // N rounded down to a multiple of 4
+ and BO3 , N , #-2 // N rounded down to a multiple of 2
+
+ mul BO2, BO2, r4
+ mul BO3, BO3, r4
+
+ add BO2 , BO2, r3 // BO2 = B + (N & ~3) * M * SIZE (2-wide panel region)
+ add BO3 , BO3, r3 // BO3 = B + (N & ~1) * M * SIZE (1-wide panel region)
+
+ lsl M4, M, #4 // M4 = M * 4 * SIZE
+
+sgemm_tcopy_L4_BEGIN:
+
+ asrs J, M, #2 // J = M / 4
+ ble sgemm_tcopy_L2_BEGIN
+
+sgemm_tcopy_L4_M4_BEGIN:
+
+ ldr AO1, A // AO1 = A
+ lsl r3, LDA, #2 // r3 = 4 * LDA
+ add r3, r3 , AO1 // A = A + 4 * LDA
+ str r3, A // store A
+
+ ldr BO1, B
+ add r3, BO1, #64 // B = B + 16 * SIZE
+ str r3, B
+
+ asrs I, N, #2 // I = N / 4
+ ble sgemm_tcopy_L4_M4_40
+
+sgemm_tcopy_L4_M4_20:
+
+ COPY4x4_1
+
+ subs I , I , #1
+ ble sgemm_tcopy_L4_M4_40
+
+ COPY4x4_2
+
+ subs I , I , #1
+ bne sgemm_tcopy_L4_M4_20
+
+
+sgemm_tcopy_L4_M4_40:
+
+ tst N , #2
+ ble sgemm_tcopy_L4_M4_60
+
+ COPY2x4
+
+
+sgemm_tcopy_L4_M4_60:
+
+ tst N, #1
+ ble sgemm_tcopy_L4_M4_END
+
+ COPY1x4
+
+
+sgemm_tcopy_L4_M4_END:
+
+ subs J , J, #1 // j--
+ bne sgemm_tcopy_L4_M4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+// Handle two leftover rows when M % 4 >= 2.
+sgemm_tcopy_L2_BEGIN:
+
+ tst M, #3
+ ble sgemm_tcopy_L999
+
+ tst M, #2
+ ble sgemm_tcopy_L1_BEGIN
+
+sgemm_tcopy_L2_M4_BEGIN:
+
+ ldr AO1, A // AO1 = A
+ lsl r3, LDA, #1 // r3 = 2 * LDA
+ add r3, r3 , AO1 // A = A + 2 * LDA
+ str r3, A // store A
+
+ ldr BO1, B
+ add r3, BO1, #32 // B = B + 8 * SIZE
+ str r3, B
+
+ asrs I, N, #2 // I = N / 4
+ ble sgemm_tcopy_L2_M4_40
+
+sgemm_tcopy_L2_M4_20:
+
+ COPY4x2
+
+ subs I , I , #1
+ bne sgemm_tcopy_L2_M4_20
+
+
+sgemm_tcopy_L2_M4_40:
+
+ tst N , #2
+ ble sgemm_tcopy_L2_M4_60
+
+ COPY2x2
+
+sgemm_tcopy_L2_M4_60:
+
+ tst N , #1
+ ble sgemm_tcopy_L2_M4_END
+
+ COPY1x2
+
+
+sgemm_tcopy_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+// Handle the last row when M is odd.
+sgemm_tcopy_L1_BEGIN:
+
+ tst M, #1
+ ble sgemm_tcopy_L999
+
+
+sgemm_tcopy_L1_M4_BEGIN:
+
+ ldr AO1, A // AO1 = A
+ add r3, LDA , AO1 // A = A + 1 * LDA
+ str r3, A // store A
+
+ ldr BO1, B
+ add r3, BO1, #16 // B = B + 4 * SIZE
+ str r3, B
+
+ asrs I, N, #2 // I = N / 4
+ ble sgemm_tcopy_L1_M4_40
+
+sgemm_tcopy_L1_M4_20:
+
+ COPY4x1
+
+ subs I , I , #1
+ bne sgemm_tcopy_L1_M4_20
+
+
+sgemm_tcopy_L1_M4_40:
+
+ tst N , #2
+ ble sgemm_tcopy_L1_M4_60
+
+ COPY2x1
+
+sgemm_tcopy_L1_M4_60:
+
+ tst N , #1
+ ble sgemm_tcopy_L1_M4_END
+
+ COPY1x1
+
+
+sgemm_tcopy_L1_M4_END:
+
+
+
+sgemm_tcopy_L999:
+
+ sub r3, fp, #128
+ vldm r3, { s8 - s15} // restore floating point registers
+
+ mov r0, #0 // set return value
+ sub sp, fp, #24
+ pop {r4 - r9, fp}
+ bx lr
+
+ EPILOGUE
+
diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S
new file mode 100644
index 000000000..ab5ff7fa2
--- /dev/null
+++ b/kernel/arm/strmm_kernel_4x2_vfp.S
@@ -0,0 +1,1081 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 252 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-240 ] +#define KKK [fp, #-244] +#define C [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-276 ] + +#define B [fp, #4 ] +#define OLD_C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 64 +#define B_PRE 64 +#define C_PRE 64 + 
+ +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + vmov.f32 s14, s8 + vmov.f32 s15, s8 + +.endm + + + +.macro KERNEL4x2_SUB + + fldmias AO!, { s0 - s3 } + fldmias BO!, { s4 - s5 } + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + fmacs s14 , s2, s5 + fmacs s15 , s3, s5 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + fmuls s6 , s0 , s14 + fmuls s7 , s0 , s15 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + fsts s6 , [CO2, #8 ] + fsts s7 , [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s12, s8 + vmov.f32 s13, s8 + +.endm + +.macro KERNEL2x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + fmacs s12 , s0, s5 + fmacs s13 , s1, s5 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + fmuls s4 , s0 , s12 + fmuls s5 , s0 , s13 + + fsts s4 , [CO2] + fsts s5 , [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s12, s8 + +.endm + 
+.macro KERNEL1x2_SUB + + flds s4 , [ BO ] + flds s5 , [ BO, #4 ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + fmacs s12 , s0, s5 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + + fmuls s4 , s0 , s12 + + fsts s4 , [CO2] + + add CO1, CO1, #4 + +.endm + + + +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9, s8 + vmov.f32 s10, s8 + vmov.f32 s11, s8 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + fmacs s10 , s2, s4 + fmacs s11 , s3, s4 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + fmuls s6 , s0 , s10 + fmuls s7 , s0 , s11 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + fsts s6 , [CO1, #8 ] + fsts s7 , [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s8 , s8 , s8 + vmov.f32 s9 , s8 + +.endm + +.macro KERNEL2x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s8 , s0, s4 + fmacs s9 , s1, s4 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + fmuls s5 , s0 , s9 + + fsts s4 , [CO1] + fsts s5 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s8 , s8 , s8 + +.endm + +.macro KERNEL1x1_SUB + + flds s4 , [ BO ] + + flds s0 , [ AO ] + + fmacs s8 , s0, s4 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + flds s0, ALPHA + + fmuls s4 , s0 , s8 + + fsts s4 , [CO1] + + add CO1, CO1, #4 + +.endm + + 
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OLD_C + str r3, C + + ldr BC, B + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO , #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + pld [ AO , #A_PRE ] + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt 
_L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + +_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 
, #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #2 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // 
store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #4 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , 
r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #2 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr L , K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr L , K + ldr r3, KK + sub L , L, r3 + str L , KKK +#else + ldr L , KK +#ifdef LEFT + add L , L , #1 // number of values in AO +#else + add L , L , #1 // number of values in BO +#endif + str L , KKK +#endif + + mov K1, L + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB 
+ KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S new file mode 100644 index 000000000..3a0c8af87 --- /dev/null +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -0,0 +1,1884 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/23 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA s0 + +/****************************************************** +* [fp, #-128] - [fp, #-32] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KK [fp, #-244 ] +#define KKK [fp, #-248] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] +#define A [fp, #-268 ] + +#define ALPHA [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT4x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s26, s16 + vmov.f32 s27, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + vmov.f32 s30, s16 + vmov.f32 s31, s16 + +.endm + +.macro KERNEL4x4_I + + fldmias AO!, { s0 - s1 } + pld [ AO , #A_PRE-8 ] + fldmias BO!, { s8 - s9 } + pld [ BO , #B_PRE-8 ] + + fmuls s16 , s0, s8 + fldmias AO!, { s2 - s3 } + fmuls s17 , s1, s8 + fmuls s18 , s2, s8 + fldmias BO!, { s10 - s11 } + fmuls s19 , s3, s8 + + fmuls s20 , s0, s9 + fldmias AO!, { s4 - s5 } + fmuls s21 , s1, s9 + fmuls s22 , s2, s9 + fldmias AO!, { s6 - s7 } + fmuls s23 , s3, s9 + + fmuls s24 , s0, s10 + fldmias BO!, { s12 - s13 } + fmuls s25 , s1, s10 + fmuls s26 , s2, s10 + fldmias BO!, { s14 - s15 } + fmuls s27 , s3, s10 + + fmuls s28 , s0, s11 + fmuls s29 , s1, s11 + fmuls s30 , s2, s11 + fmuls s31 , s3, s11 + +.endm + + +.macro KERNEL4x4_M2 + + pld [ AO , #A_PRE ] + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fldmias AO!, { s0 - s1 } + fmacs s18 , s6, s12 + pld [ BO , #B_PRE ] + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fldmias AO!, { s2 - s3 } + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fldmias BO!, { s8 - s9 } + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fldmias BO!, { s10 - s11 } + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + +.macro KERNEL4x4_M1 + + fmacs s16 , s0, s8 + fldmias AO!, { s4 - s5 } + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fldmias AO!, { s6 - s7 } + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fldmias BO!, { s12 - s13 } + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fldmias BO!, { s14 - s15 } + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + fmacs s30 , s2, s11 + fmacs s31 , s3, s11 + +.endm + + + 
+.macro KERNEL4x4_E + + fmacs s16 , s4, s12 + fmacs s17 , s5, s12 + fmacs s18 , s6, s12 + fmacs s19 , s7, s12 + + fmacs s20 , s4, s13 + fmacs s21 , s5, s13 + fmacs s22 , s6, s13 + fmacs s23 , s7, s13 + + fmacs s24 , s4, s14 + fmacs s25 , s5, s14 + fmacs s26 , s6, s14 + fmacs s27 , s7, s14 + + fmacs s28 , s4, s15 + fmacs s29 , s5, s15 + fmacs s30 , s6, s15 + fmacs s31 , s7, s15 + +.endm + + + + +.macro KERNEL4x4_SUB + + flds s8 , [ BO ] + pld [ BO , #B_PRE ] + + flds s0 , [ AO ] + pld [ AO , #A_PRE ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + flds s2 , [ AO, #8 ] + fmacs s17 , s1, s8 + flds s3 , [ AO, #12 ] + fmacs s18 , s2, s8 + flds s9 , [ BO, #4 ] + fmacs s19 , s3, s8 + + flds s10, [ BO, #8 ] + fmacs s20 , s0, s9 + flds s11, [ BO, #12 ] + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + fmacs s26 , s2, s10 + fmacs s27 , s3, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #16 + fmacs s30 , s2, s11 + add BO , BO, #16 + fmacs s31 , s3, s11 + +.endm + +.macro SAVE4x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + flds s0, ALPHA + add r4 , CO2, r3 + + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fmuls s12, s0 , s20 + fsts s8 , [CO1] + fmuls s13, s0 , s21 + fsts s9 , [CO1, #4 ] + fmuls s14, s0 , s22 + fsts s10, [CO1, #8 ] + fmuls s15, s0 , s23 + fsts s11, [CO1, #12 ] + + + fmuls s8 , s0 , s24 + fsts s12, [CO2] + fmuls s9 , s0 , s25 + fsts s13, [CO2, #4 ] + fmuls s10, s0 , s26 + fsts s14, [CO2, #8 ] + fmuls s11, s0 , s27 + fsts s15, [CO2, #12 ] + + add CO2, r4 , r3 + + fsts s8 , [r4 ] + fmuls s12, s0 , s28 + fsts s9 , [r4 , #4 ] + fmuls s13, s0 , s29 + fsts s10, [r4 , #8 ] + fmuls s14, s0 , s30 + fsts s11, [r4 , #12 ] + fmuls s15, s0 , s31 + + fstmias CO2, { s12 - s15 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + 
vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s24, s16 + vmov.f32 s25, s16 + vmov.f32 s28, s16 + vmov.f32 s29, s16 + +.endm + + + +.macro KERNEL2x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + fmacs s24 , s0, s10 + fmacs s25 , s1, s10 + + fmacs s28 , s0, s11 + fmacs s29 , s1, s11 + add AO , AO, #8 + add BO , BO, #16 + +.endm + +.macro SAVE2x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + + fmuls s8 , s0 , s24 + fmuls s9 , s0 , s25 + + fsts s8 , [r4 ] + fsts s9 , [r4 , #4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fmuls s13, s0 , s29 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + + +/******************************************************************************/ + +.macro INIT1x4 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + vmov.f32 s24, s16 + vmov.f32 s28, s16 + +.endm + + + +.macro KERNEL1x4_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + flds s10, [ BO, #8 ] + flds s11, [ BO, #12 ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + fmacs s24 , s0, s10 + fmacs s28 , s0, s11 + + add AO , AO, #4 + add BO , BO, #16 + +.endm + +.macro SAVE1x4 + + ldr r3 , LDC + add CO2 , CO1, r3 + add r4 , CO2, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + fmuls s8 , s0 , s24 + fsts s8 , [r4 ] + + add CO2, r4 , r3 + + fmuls s12, s0 , s28 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x2 + + 
vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + vmov.f32 s22, s16 + vmov.f32 s23, s16 + +.endm + + + +.macro KERNEL4x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + fmacs s22 , s2, s9 + fmacs s23 , s3, s9 + + add AO , AO, #16 + add BO , BO, #8 + +.endm + +.macro SAVE4x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + fmuls s14, s0 , s22 + fmuls s15, s0 , s23 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + fsts s14, [CO2, #8 ] + fsts s15, [CO2, #12 ] + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s20, s16 + vmov.f32 s21, s16 + +.endm + + + +.macro KERNEL2x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + fmacs s20 , s0, s9 + fmacs s21 , s1, s9 + + add AO , AO, #8 + add BO , BO, #8 + +.endm + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + fmuls s12, s0 , s20 + fmuls s13, s0 , s21 + + fsts s12, [CO2] + fsts s13, [CO2, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s20, s16 + +.endm + + + +.macro KERNEL1x2_SUB + + flds s8 , [ BO ] + flds s9 , [ BO, #4 ] + + flds 
s0 , [ AO ] + fmacs s16 , s0, s8 + fmacs s20 , s0, s9 + + add AO , AO, #4 + add BO , BO, #8 + +.endm + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + fmuls s12, s0 , s20 + fsts s12, [CO2] + + add CO1, CO1, #4 + +.endm + +/******************************************************************************/ +/******************************************************************************/ + +.macro INIT4x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + vmov.f32 s18, s16 + vmov.f32 s19, s16 + +.endm + + + +.macro KERNEL4x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + flds s2 , [ AO, #8 ] + flds s3 , [ AO, #12 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + fmacs s18 , s2, s8 + fmacs s19 , s3, s8 + + add AO , AO, #16 + add BO , BO, #4 + +.endm + +.macro SAVE4x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + fmuls s10, s0 , s18 + fmuls s11, s0 , s19 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + fsts s10, [CO1, #8 ] + fsts s11, [CO1, #12 ] + + add CO1, CO1, #16 + +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f32 s16 , s16 , s16 + vmov.f32 s17, s16 + +.endm + + + +.macro KERNEL2x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + flds s1 , [ AO, #4 ] + + fmacs s16 , s0, s8 + fmacs s17 , s1, s8 + + add AO , AO, #8 + add BO , BO, #4 + +.endm + +.macro SAVE2x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fmuls s9 , s0 , s17 + + fsts s8 , [CO1] + fsts s9 , [CO1, #4 ] + + add CO1, CO1, #8 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f32 s16 , s16 , s16 + +.endm + + + +.macro KERNEL1x1_SUB + + flds s8 , [ BO ] + + flds s0 , [ AO ] + + fmacs s16 , s0, s8 + + add AO , AO, #4 + add BO , BO, #4 + +.endm + +.macro SAVE1x1 + + + flds s0, ALPHA + + fmuls s8 , s0 , s16 + fsts s8 , [CO1] + + add CO1, CO1, #4 + +.endm + + + 
+ + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA, ALPHA + + sub r3, fp, #128 + vstm r3, { s8 - s31} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #2 // ldc = ldc * 4 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #2 // J = J / 4 + ble _L2_BEGIN + +_L4_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #2 // LDC * 4 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L4_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L4_M2_BEGIN + +_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L4_M4_30 + .align 5 + + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + sub L, L, #2 + +_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs 
L, L, #1 + bgt _L4_M4_22 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_30: + tst L, #3 + ble _L4_M4_40 + + tst L, #2 + ble _L4_M4_32 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + +_L4_M4_32: + + tst L, #1 + ble _L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + b _L4_M4_44 + + +_L4_M4_40: + + INIT4x4 + + +_L4_M4_44: + + ands L , K1, #7 // L = L % 8 + ble _L4_M4_100 + +_L4_M4_46: + + KERNEL4x4_SUB + + subs L, L, #1 + bne _L4_M4_46 + +_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L4_M4_END: + + subs I, I, #1 + bne _L4_M4_20 + + +_L4_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L4_END + + tst I, #2 // I = I / 2 + ble _L4_M1_BEGIN + +_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK 
+#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M2_40 + +_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_22 + + +_L4_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M2_100 + +_L4_M2_42: + + KERNEL2x4_SUB + + subs L, L, #1 + bgt _L4_M2_42 + +_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L4_M2_END: + + +_L4_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L4_END + +_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 4 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #4 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L4_M1_40 + +_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_22 + + +_L4_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L4_M1_100 + +_L4_M1_42: + + KERNEL1x4_SUB + + subs L, L, #1 + bgt _L4_M1_42 + +_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 4 float 
values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L4_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #4 // k * 4 * 4 + add r3, r3, r4 // B = B + K * 4 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in BO + str r3 , KK +#endif + + subs J , #1 // j-- + bgt _L4_BEGIN + + + +/*********************************************************************************************/ + +_L2_BEGIN: + + ldr J , N + tst J , #3 + ble _L999 + + tst J , #2 + ble _L1_BEGIN + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L2_M4_BEGIN: + + ldr I, M + asrs I, I, #2 // I = I / 4 + ble _L2_M2_BEGIN + +_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M4_40 + .align 5 + +_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_22 + + +_L2_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M4_100 + +_L2_M4_42: + + KERNEL4x2_SUB + + subs L, L, #1 + bgt _L2_M4_42 + 
+_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + +_L2_M4_END: + + subs I, I, #1 + bgt _L2_M4_20 + + +_L2_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L2_END + + tst I, #2 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M2_40 + +_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_22 + + +_L2_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_42: + + KERNEL2x2_SUB + + subs L, L, #1 + bgt _L2_M2_42 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L2_M2_END: + + +_L2_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + 
INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #3 // 2 float values + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #3 // k * 2 * 4 + add r3, r3, r4 // B = B + K * 2 * 4 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + ldr AO, A // AO = A + //pld [AO , #A_PRE-96] + //pld [AO , #A_PRE-64] + //pld [AO , #A_PRE-32] + + + +_L1_M4_BEGIN: + + ldr I, M 
+ asrs I, I, #2 // I = I / 4 + ble _L1_M2_BEGIN + +_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #4 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M4_40 + .align 5 + +_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_22 + + +_L1_M4_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M4_100 + +_L1_M4_42: + + KERNEL4x1_SUB + + subs L, L, #1 + bgt _L1_M4_42 + +_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #4 // 4 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #4 // number of values in AO + str r3 , KK +#endif + + + + +_L1_M4_END: + + subs I, I, #1 + bgt _L1_M4_20 + + +_L1_M2_BEGIN: + + ldr I, M + tst I , #3 + ble _L1_END + + tst I, #2 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK 
+ sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M2_40 + +_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_22 + + +_L1_M2_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_42: + + KERNEL2x1_SUB + + subs L, L, #1 + bgt _L1_M2_42 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #3 // 2 float values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + +_L1_M1_BEGIN: + + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #2 // 1 float value + add BO , BO , r4 + lsls r4 , r3 , #2 // 1 float value + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: 
+ + SAVE1x1 + + +_L1_END: + + +_L999: + + sub r3, fp, #128 + vldm r3, { s8 - s31} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/swap.c b/kernel/arm/swap.c new file mode 100644 index 000000000..1ca9e7607 --- /dev/null +++ b/kernel/arm/swap.c @@ -0,0 +1,62 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/08/20 Saar +* BLASTEST float OK +* BLASTEST double OK +* +**************************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S new file mode 100644 index 000000000..352875188 --- /dev/null +++ b/kernel/arm/swap_vfp.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/14 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + + +#define N r0 +#define Y r1 +#define INC_X r2 +#define X r3 +#define INC_Y r4 + +#define I r12 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +/*****************************************************************************************/ + + + +#if !defined(COMPLEX) + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + + +.macro KERNEL_F1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y!, { d0 } + fstmiad X!, { d4 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 } + fldmiad Y, { d4 } + fstmiad Y, { d0 } + fstmiad X, { d4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + +#else + +.macro KERNEL_F4 + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { 
s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + + +.macro KERNEL_F1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y!, { s0 } + fstmias X!, { s4 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 } + fldmias Y, { s4 } + fstmias Y, { s0 } + fstmias X, { s4 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#endif + +#else + +#if defined(DOUBLE) + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmiad X, { d0 - d3 } + fldmiad Y, { d4 - d7 } + fstmiad Y!, { d0 - d3 } + fstmiad X!, { d4 - d7} + +.endm + +.macro KERNEL_F1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y!, { d0 - d1 } + fstmiad X!, { d4 - d5 } + +.endm + +.macro KERNEL_S1 + + fldmiad X, { d0 - d1 } + fldmiad Y, { d4 - d5 } + fstmiad Y, { d0 - d1 } + fstmiad X, { d4 - d5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +#else + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + + fldmias X, { s0 - s3 } + fldmias Y, { s4 - s7 } + fstmias Y!, { s0 - s3 } + fstmias X!, { s4 - s7} + +.endm + +.macro KERNEL_F1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y!, { s0 - s1 } + fstmias X!, { s4 - s5 } + +.endm + +.macro KERNEL_S1 + + fldmias X, { s0 - s1 } + fldmias Y, { s4 - s5 } + fstmias Y, { s0 - s1 } + fstmias X, { s4 - s5 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +#endif + +#endif + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + push {r4 , fp} + add fp, sp, #8 + + ldr INC_X , OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y , OLD_INC_Y + + + cmp N, #0 + ble swap_kernel_L999 + + cmp INC_X, #0 + beq swap_kernel_L999 + + cmp 
INC_Y, #0 + beq swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + + +swap_kernel_F_BEGIN: + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_F1 + + .align 5 + +swap_kernel_F4: + +#if !defined(COMPLEX) && !defined(DOUBLE) + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] +#endif + + KERNEL_F4 + + subs I, I, #1 + ble swap_kernel_F1 + + KERNEL_F4 + + subs I, I, #1 + bne swap_kernel_F4 + +swap_kernel_F1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + +swap_kernel_S_BEGIN: + +#if defined(COMPLEX) + +#if defined(DOUBLE) + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2 +#endif + +#else + +#if defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE + lsl INC_Y, INC_Y, #3 // INC_Y * SIZE +#else + lsl INC_X, INC_X, #2 // INC_X * SIZE + lsl INC_Y, INC_Y, #2 // INC_Y * SIZE +#endif + +#endif + + + asrs I, N, #2 // I = N / 4 + ble swap_kernel_S1 + + .align 5 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + + +swap_kernel_L999: + + mov r0, #0 // set return value + + sub sp, fp, #8 + pop {r4,fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c new file mode 100644 index 000000000..8c2a5c346 --- /dev/null +++ b/kernel/arm/zamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf[2]; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) > CABS1(maxf,0) ) + { + max = i; + maxf[0] = ABS(x[ix]); + maxf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(CABS1(maxf,0)); +} + + diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c new file mode 100644 index 000000000..6956ced0e --- /dev/null +++ b/kernel/arm/zamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf[2]; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n < 0 || inc_x < 1 ) return(0.0); + + inc_x2 = 2 * inc_x; + + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + + while(i < n) + { + if( CABS1(x,ix) < CABS1(minf,0) ) + { + min = i; + minf[0] = ABS(x[ix]); + minf[1] = ABS(x[ix+1]); + } + ix += inc_x2; + i++; + } + return(CABS1(minf,0)); +} + + diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c new file mode 100644 index 000000000..13acfc0f0 --- /dev/null +++ b/kernel/arm/zasum.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The 
OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + if (n < 0 || inc_x < 1 ) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/arm/zaxpy.c b/kernel/arm/zaxpy.c new file mode 100644 index 000000000..28a4380fb --- /dev/null +++ b/kernel/arm/zaxpy.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/15 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zcopy.c b/kernel/arm/zcopy.c new file mode 100644 index 000000000..654711240 --- /dev/null +++ b/kernel/arm/zcopy.c @@ -0,0 +1,63 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zcopy_vfp.S b/kernel/arm/zcopy_vfp.S new file mode 100644 index 000000000..06f892446 --- /dev/null +++ b/kernel/arm/zcopy_vfp.S @@ -0,0 +1,223 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY_F4 + + pld [ X, #X_PRE ] + pld [ X, #X_PRE+32 ] + fldmiad X!, { d0 - d7 } + fstmiad Y!, { d0 - d7 } + +.endm + +.macro COPY_F1 + + fldmiad X!, { d0 - d1 } + fstmiad Y!, { d0 - d1 } + +.endm + + +/*************************************************************************************************************************/ + +.macro COPY_S4 + + nop + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add 
X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + + fldmiad X, { d2 - d3 } + fstmiad Y, { d2 - d3 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + +.macro COPY_S1 + + fldmiad X, { d0 - d1 } + fstmiad Y, { d0 - d1 } + add X, X, INC_X + add Y, Y, INC_Y + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + cmp N, #0 + ble zcopy_kernel_L999 + + cmp INC_X, #0 + beq zcopy_kernel_L999 + + cmp INC_Y, #0 + beq zcopy_kernel_L999 + + cmp INC_X, #1 + bne zcopy_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zcopy_kernel_S_BEGIN + +zcopy_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_F1 + +zcopy_kernel_F4: + + COPY_F4 + + subs I, I, #1 + bne zcopy_kernel_F4 + +zcopy_kernel_F1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_F10: + + COPY_F1 + + subs I, I, #1 + bne zcopy_kernel_F10 + + b zcopy_kernel_L999 + +zcopy_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zcopy_kernel_S1 + +zcopy_kernel_S4: + + COPY_S4 + + subs I, I, #1 + bne zcopy_kernel_S4 + +zcopy_kernel_S1: + + ands I, N, #3 + ble zcopy_kernel_L999 + +zcopy_kernel_S10: + + COPY_S1 + + subs I, I, #1 + bne zcopy_kernel_S10 + + + + + + +zcopy_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c new file mode 100644 index 000000000..096ced9db --- /dev/null +++ 
b/kernel/arm/zdot.c @@ -0,0 +1,78 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : FAIL +* BLASTEST double : FAIL +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + FLOAT _Complex result; + + dot[0]=0.0; + dot[1]=0.0; + + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + + if ( n < 1 ) return(result); + + BLASLONG inc_x2 = 2 * inc_x ; + BLASLONG inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + __real__ result = dot[0]; + __imag__ result = dot[1]; + return(result); + +} + + diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S new file mode 100644 index 000000000..1a78b5aec --- /dev/null +++ b/kernel/arm/zdot_vfp.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/11 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define N r0 +#define X r1 +#define INC_X r2 +#define OLD_Y r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define OLD_INC_Y [fp, #4 ] + +#define I r5 +#define Y r6 +#define INC_Y r7 + +#define X_PRE 512 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F4 + + pld [ X, #X_PRE ] + pld [ Y, #X_PRE ] + + 
+	// Partial sums across the loop: d0 += xr*yr, d1 += xr*yi,
+	// d2 += xi*yi, d3 += xi*yr; they are combined after the loop
+	// (real = d0 -/+ d2, imag = d1 +/- d3 depending on CONJ).
+	fldmiad	X!, { d4 - d5 }			// d4 = x re, d5 = x im (post-increment)
+	fldmiad	Y!, { d8 - d9 }			// d8 = y re, d9 = y im
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fldmiad	X!, { d6 - d7 }			// next complex element of x
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+
+	fldmiad	Y!, { d10 - d11 }		// next complex element of y
+	fmacd	d0 , d6, d10
+	fmacd	d1 , d6, d11
+	pld	[ X, #X_PRE ]
+	fmacd	d2 , d7, d11
+	fmacd	d3 , d7, d10
+
+	pld	[ Y, #X_PRE ]
+
+	fldmiad	X!, { d4 - d5 }
+	fldmiad	Y!, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fldmiad	X!, { d6 - d7 }
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+
+	fldmiad	Y!, { d10 - d11 }
+	fmacd	d0 , d6, d10
+	fmacd	d1 , d6, d11
+	fmacd	d2 , d7, d11
+	fmacd	d3 , d7, d10
+
+.endm
+
+// One complex element per invocation, unit stride (remainder loop).
+.macro KERNEL_F1
+
+	fldmiad	X!, { d4 - d5 }
+	fldmiad	Y!, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+
+.endm
+
+
+/*************************************************************************************************************************/
+
+// Strided variant, four elements per invocation. INC_X / INC_Y hold the
+// byte stride per complex element (shifted left by 4 in zdot_kernel_S_BEGIN).
+.macro KERNEL_S4
+
+	nop
+
+	fldmiad	X, { d4 - d5 }
+	fldmiad	Y, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d4 - d5 }
+	fldmiad	Y, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d4 - d5 }
+	fldmiad	Y, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+	fldmiad	X, { d4 - d5 }
+	fldmiad	Y, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+// Strided variant, one element per invocation (remainder loop).
+.macro KERNEL_S1
+
+	fldmiad	X, { d4 - d5 }
+	fldmiad	Y, { d8 - d9 }
+	fmacd	d0 , d4, d8
+	fmacd	d1 , d4, d9
+	fmacd	d2 , d5, d9
+	fmacd	d3 , d5, d8
+	add	X, X, INC_X
+	add	Y, Y, INC_Y
+
+.endm
+
+
+
+/**************************************************************************************
+* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + mov Y, OLD_Y + ldr INC_Y, OLD_INC_Y + + vsub.f64 d0 , d0 , d0 + vsub.f64 d1 , d1 , d1 + vsub.f64 d2 , d2 , d2 + vsub.f64 d3 , d3 , d3 + + cmp N, #0 + ble zdot_kernel_L999 + + cmp INC_X, #0 + beq zdot_kernel_L999 + + cmp INC_Y, #0 + beq zdot_kernel_L999 + + cmp INC_X, #1 + bne zdot_kernel_S_BEGIN + + cmp INC_Y, #1 + bne zdot_kernel_S_BEGIN + +zdot_kernel_F_BEGIN: + + asrs I, N, #2 // I = N / 4 + ble zdot_kernel_F1 + +zdot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zdot_kernel_F4 + +zdot_kernel_F1: + + ands I, N, #3 + ble zdot_kernel_L999 + +zdot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zdot_kernel_F10 + + b zdot_kernel_L999 + +zdot_kernel_S_BEGIN: + + lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 + lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2 + + asrs I, N, #2 // I = N / 4 + ble zdot_kernel_S1 + +zdot_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne zdot_kernel_S4 + +zdot_kernel_S1: + + ands I, N, #3 + ble zdot_kernel_L999 + +zdot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zdot_kernel_S10 + + + +zdot_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + +#if !defined(CONJ) + vsub.f64 d0 , d0, d2 + vadd.f64 d1 , d1, d3 +#else + vadd.f64 d0 , d0, d2 + vsub.f64 d1 , d1, d3 +#endif + + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S new file mode 100644 index 000000000..8a5401858 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -0,0 +1,1299 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +***************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + 
+/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + 
fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + fldmiad CO2, { d4 - d7 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 
, d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 
] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + fldmiad CO2, { d4 - d5 } + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, 
d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , 
d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble zgemm_kernel_L1_BEGIN + +zgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +zgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp 
L , #3 + blt zgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_30: + tst L, #3 + ble zgemm_kernel_L2_M2_40 + + tst L, #2 + ble zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_40: + + INIT2x2 + + +zgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L2_M2_20 + + +zgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: 
+ + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt zgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble zgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +zgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt zgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_30: + tst L, #3 + ble zgemm_kernel_L1_M2_40 + + tst L, #2 + ble zgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + +zgemm_kernel_L1_M2_32: + + tst L, #1 + ble zgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_40: + + INIT2x1 + + +zgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L 
= L % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne zgemm_kernel_L1_M2_46 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L1_M2_20 + + +zgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..2d35028a2 --- /dev/null +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -0,0 +1,1345 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/11/02 Saar +* UNROLL_N 2 +* UNROLL_M 2 +* ZGEMM_P 64 +* ZGEMM_Q 120 +* ZGEMM_R 4096 +* A_PRE 96 +* B_PRE 96 +* C_PRE 64 +* +* Performance on Odroid U2: +* +* 1 Core: 1.62 GFLOPS ATLAS: 1.39 GFLOPS +* 2 Cores: 3.20 GFLOPS ATLAS: 2.54 GFLOPS +* 3 Cores: 4.72 GFLOPS ATLAS: 3.76 GFLOPS +* 4 Cores: 5.93 GFLOPS ATLAS: 4.88 GFLOPS +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers 
+*******************************************************/ + +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + 
fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + 
add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + fldmiad CO2, { d8 - d11 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO 
, #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + pld [ CO1 , #C_PRE ] + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + fldmiad CO2, { d8 - d9 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 
, d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d7 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + 
vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + pld [ CO1 , #C_PRE ] + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + fldmiad CO1, { d4 - d5 } + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + 
+ PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr K1, K + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble zgemm_kernel_L1_BEGIN + +zgemm_kernel_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +zgemm_kernel_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt zgemm_kernel_L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_30: + tst L, #3 + ble zgemm_kernel_L2_M2_40 + + tst L, #2 + ble zgemm_kernel_L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b zgemm_kernel_L2_M2_44 + +zgemm_kernel_L2_M2_32: + + tst L, #1 + ble zgemm_kernel_L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b 
zgemm_kernel_L2_M2_44 + + +zgemm_kernel_L2_M2_40: + + INIT2x2 + + +zgemm_kernel_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne zgemm_kernel_L2_M2_46 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L2_M2_20 + + +zgemm_kernel_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + + mov r3, BC + mov r4, K1 + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + + subs J , #1 // j-- + bgt zgemm_kernel_L2_BEGIN + + + +/*********************************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + ldr J , N + tst J , #1 + ble zgemm_kernel_L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + + ldr AO, A // AO = A + +zgemm_kernel_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt zgemm_kernel_L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, 
L, #1 + bgt zgemm_kernel_L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_30: + tst L, #3 + ble zgemm_kernel_L1_M2_40 + + tst L, #2 + ble zgemm_kernel_L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + +zgemm_kernel_L1_M2_32: + + tst L, #1 + ble zgemm_kernel_L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b zgemm_kernel_L1_M2_44 + + +zgemm_kernel_L1_M2_40: + + INIT2x1 + + +zgemm_kernel_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne zgemm_kernel_L1_M2_46 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + subs I, I, #1 + bne zgemm_kernel_L1_M2_20 + + +zgemm_kernel_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov BO, BC + asrs L , K1, #3 // L = L / 8 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + + +zgemm_kernel_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git 
a/kernel/arm/zgemm_ncopy_2_vfp.S b/kernel/arm/zgemm_ncopy_2_vfp.S new file mode 100644 index 000000000..5ff8ee299 --- /dev/null +++ b/kernel/arm/zgemm_ncopy_2_vfp.S @@ -0,0 +1,254 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/05 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define LDA [fp, #-260 ] + +#define B [fp, #4 ] + +#define M r0 +#define N r1 +#define A r2 + +#define BO r5 + +#define AO1 r6 +#define AO2 r7 + +#define I r3 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + pld [ AO2, #A_PRE ] + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d4 , [ AO1, #16 ] + fldd d5 , [ AO1, #24 ] + + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + add AO1, AO1, #32 + fldd d6 , [ AO2, #16 ] + fldd d7 , [ AO2, #24 ] + + fstmiad BO!, { d0 - d7 } + add AO2, AO2, #32 + +.endm + + +.macro COPY1x2 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO2, #0 ] + fldd d3 , [ AO2, #8 ] + + add AO1, AO1, #16 + fstmiad BO!, { d0 - d3 } + add AO2, AO2, #16 + +.endm + +.macro COPY2x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + fldd d2 , [ AO1, #16 ] + fldd d3 , [ AO1, #24 ] + + fstmiad BO!, { d0 - d3 } + add AO1, AO1, #32 + +.endm + + +.macro COPY1x1 + + fldd d0 , [ AO1, #0 ] + fldd d1 , [ AO1, #8 ] + + fstmiad BO!, { d0 - d1 } + add AO1, AO1, #16 + +.endm + + + + + 
+/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + + lsl r3, r3, #4 // lda = lda * 8 * 2 + str r3, LDA + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + ldr BO, B + +/*********************************************************************************************/ + +zgemm_ncopy_L2_BEGIN: + + asrs J, N, #1 // J = N / 2 + ble zgemm_ncopy_L1_BEGIN + +zgemm_ncopy_L2_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add AO2, AO1, r4 + add A , AO2, r4 // A = A + 2 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L2_M2_40 + +zgemm_ncopy_L2_M2_20: + + COPY2x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_20 + + +zgemm_ncopy_L2_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L2_M2_END + +zgemm_ncopy_L2_M2_60: + + COPY1x2 + + subs I , I , #1 + bne zgemm_ncopy_L2_M2_60 + + +zgemm_ncopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_ncopy_L2_M2_BEGIN + + +/*********************************************************************************************/ + +zgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble zgemm_ncopy_L999 + + +zgemm_ncopy_L1_M2_BEGIN: + + mov AO1, A // AO1 = A + ldr r4 , LDA + add A , AO1, r4 // A = A + 1 * LDA + + asrs I, M, #1 // I = M / 2 + ble zgemm_ncopy_L1_M2_40 + +zgemm_ncopy_L1_M2_20: + + COPY2x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_20 + + +zgemm_ncopy_L1_M2_40: + + ands I, M , #1 + ble zgemm_ncopy_L1_M2_END + +zgemm_ncopy_L1_M2_60: + + COPY1x1 + + subs I , I , #1 + bne zgemm_ncopy_L1_M2_60 + + +zgemm_ncopy_L1_M2_END: + + + +zgemm_ncopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemm_tcopy_2_vfp.S 
b/kernel/arm/zgemm_tcopy_2_vfp.S new file mode 100644 index 000000000..7e27ca6a6 --- /dev/null +++ b/kernel/arm/zgemm_tcopy_2_vfp.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/07 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_A r2 +#define OLD_LDA r3 + + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define B [fp, #4 ] +#define A [fp, #-248 ] + +#define M r0 +#define N r1 +#define M4 r2 + +#define LDA r5 + +#define AO1 r6 +#define BO1 r7 +#define BO2 r8 + +#define I r4 +#define J r12 + +#define A_PRE 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro COPY2x2 + + pld [ AO1, #A_PRE ] + fldmiad AO1, { d0 - d3 } + + add r3, AO1, LDA + pld [ r3, #A_PRE ] + fldmiad r3, { d4 - d7 } + + fstmiad BO1, { d0 - d7 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x2 + + fldmiad AO1, { d0 -d1 } + + add r3, AO1, LDA + fldmiad r3, { d2 - d3 } + + fstmiad BO2, { d0 - d3 } + add AO1, AO1, #16 + add BO2, BO2, #32 + +.endm + +/*************************************************************************************************************************/ +.macro COPY2x1 + + fldmiad AO1, { d0 - d3 } + + fstmiad BO1, { d0 - d3 } + add AO1, AO1, #32 + add BO1, BO1, M4 + +.endm + +.macro COPY1x1 + + fldmiad AO1, { d0 - d1 } + + fstmiad BO2, { d0 - d1 } + add AO1, AO1, #16 + add BO2, BO2, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions 
+**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_A, A // store A + + lsl LDA, OLD_LDA, #4 // lda = lda * SIZE * 2 + + sub r4, fp, #128 + vstm r4, { d8 - d15} // store floating point registers + + lsl r4 , M, #4 // M * SIZE * 2 + + ldr r3, B + + and BO2 , N , #-2 + + mul BO2, BO2, r4 + + add BO2 , BO2, r3 + + lsl M4, M, #5 // M4 = M * 2 * SIZE * 2 + +zgemm_tcopy_L2_BEGIN: + + asrs J, M, #1 // J = N / 2 + ble zgemm_tcopy_L1_BEGIN + +zgemm_tcopy_L2_M2_BEGIN: + + ldr AO1, A // AO1 = A + lsl r3, LDA, #1 // r3 = 2 * LDA + add r3, r3 , AO1 // A = A + 2 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #64 // B = B + 4 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L2_M2_60 + +zgemm_tcopy_L2_M2_40: + + COPY2x2 + subs I, I, #1 + bne zgemm_tcopy_L2_M2_40 + +zgemm_tcopy_L2_M2_60: + + tst N , #1 + ble zgemm_tcopy_L2_M2_END + + COPY1x2 + + +zgemm_tcopy_L2_M2_END: + + subs J , J, #1 // j-- + bne zgemm_tcopy_L2_M2_BEGIN + +/*********************************************************************************************/ + +zgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble zgemm_tcopy_L999 + + +zgemm_tcopy_L1_M2_BEGIN: + + ldr AO1, A // AO1 = A + add r3, LDA , AO1 // A = A + 1 * LDA + str r3, A // store A + + ldr BO1, B + add r3, BO1, #32 // B = B + 2 * SIZE *2 + str r3, B + + asrs I, N, #1 // I = M / 2 + ble zgemm_tcopy_L1_M2_60 + + +zgemm_tcopy_L1_M2_40: + + COPY2x1 + subs I, I, #1 + bne zgemm_tcopy_L1_M2_40 + +zgemm_tcopy_L1_M2_60: + + tst N , #1 + ble zgemm_tcopy_L1_M2_END + + COPY1x1 + + +zgemm_tcopy_L1_M2_END: + + + +zgemm_tcopy_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + mov r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/zgemv_n.c b/kernel/arm/zgemv_n.c new file mode 100644 index 
000000000..dc2ffa0d2 --- /dev/null +++ b/kernel/arm/zgemv_n.c @@ -0,0 +1,157 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** + * * 2013/11/23 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n < 0 || inc_x < 1 ) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/arm/zrot.c b/kernel/arm/zrot.c new file mode 100644 index 000000000..4a2f37f64 --- /dev/null +++ b/kernel/arm/zrot.c @@ -0,0 +1,68 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n <= 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x ; + BLASLONG inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c new file mode 100644 index 000000000..833dc8c03 --- /dev/null +++ b/kernel/arm/zscal.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( n < 0 || inc_x < 1 ) return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + + if ( n < 0 ) return(0); + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + 
diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S new file mode 100644 index 000000000..59039c32f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -0,0 +1,1537 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/11/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define B_PRE 96 +#define C_PRE 64 + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(CN) || defined(CT) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fnmacd + #define FMAC_I1 fmacd + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define KMAC_R fmacd + #define KMAC_I fnmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + 
#define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#else + + #define KMAC_R fnmacd + #define KMAC_I fmacd + + #define FMAC_R1 fmacd + #define FMAC_R2 fmacd + #define FMAC_I1 fnmacd + #define FMAC_I2 fmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + vmov.f64 d14, d8 + vmov.f64 d15, d8 + +.endm + +.macro KERNEL2x2_I + pld [ AO, #A_PRE ] + pld [ BO, #B_PRE ] + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmuld d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmuld d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x2_M1 + + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + +.macro KERNEL2x2_M2 + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd 
d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro KERNEL2x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + KMAC_R d14 , d3, d7 + fmacd d15 , d2, d7 + KMAC_I d15 , d3, d6 + + add BO , BO, #32 + add AO , AO, #32 + +.endm + +.macro KERNEL2x2_SUB + + fldd d0 , [ AO ] + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + fldd d1 , [ AO, #8 ] + fmacd d9 , d0, d5 + fldd d2 , [ AO, #16 ] + KMAC_R d8 , d1, d5 + fldd d3 , [ AO, #24 ] + KMAC_I d9 , d1, d4 + + fldd d6 , [ BO, #16 ] + fmacd d10 , d2, d4 + fldd d7 , [ BO, #24 ] + fmacd d11 , d2, d5 + KMAC_R d10 , d3, d5 + pld [ AO, #A_PRE ] + KMAC_I d11 , d3, d4 + + pld [ BO, #B_PRE ] + fmacd d12 , d0, d6 + fmacd d13 , d0, d7 + KMAC_R d12 , d1, d7 + KMAC_I d13 , d1, d6 + + fmacd d14 , d2, d6 + fmacd d15 , d2, d7 + add BO , BO, #32 + KMAC_R d14 , d3, d7 + add AO , AO, #32 + KMAC_I d15 , d3, d6 + + +.endm + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + 
FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + FMAC_R1 d6 , d0 , d14 + FMAC_I1 d7 , d0 , d15 + FMAC_R2 d6 , d1 , d15 + FMAC_I2 d7 , d1 , d14 + + fstmiad CO2, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d12, d8 + vmov.f64 d13, d8 + +.endm + +.macro KERNEL1x2_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmuld d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x2_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x2_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + 
fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + +.macro KERNEL1x2_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + fldd d6 , [ BO, #16 ] + fldd d7 , [ BO, #24 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d12 , d0, d6 + KMAC_R d12 , d1, d7 + fmacd d13 , d0, d7 + KMAC_I d13 , d1, d6 + + add BO , BO, #32 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d12 + FMAC_I1 d5 , d0 , d13 + FMAC_R2 d4 , d1 , d13 + FMAC_I2 d5 , d1 , d12 + + fstmiad CO2, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + vmov.f64 d10, d8 + vmov.f64 d11, d8 + +.endm + +.macro KERNEL2x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmuld d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmuld d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + +.macro KERNEL2x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + 
KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro KERNEL2x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + +.macro KERNEL2x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + fmacd d10 , d2, d4 + KMAC_R d10 , d3, d5 + fmacd d11 , d2, d5 + KMAC_I d11 , d3, d4 + + add BO , BO, #16 + add AO , AO, #32 + + +.endm + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + vsub.f64 d6, d6 , d6 + vsub.f64 d7, d7 , d7 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + FMAC_R1 d6 , d0 , d10 + FMAC_I1 d7 , d0 , d11 + FMAC_R2 d6 , d1 , d11 + FMAC_I2 d7 , d1 , d10 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d8 , d8 , d8 + vmov.f64 d9 , d8 + +.endm + +.macro KERNEL1x1_I + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmuld d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmuld d9 , d0, d5 + 
KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + +.macro KERNEL1x1_M1 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_M2 + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro KERNEL1x1_E + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + + fldd d4 , [ BO ] + fldd d5 , [ BO, #8 ] + + fmacd d8 , d0, d4 + KMAC_R d8 , d1, d5 + fmacd d9 , d0, d5 + KMAC_I d9 , d1, d4 + + add BO , BO, #16 + add AO , AO, #16 + + +.endm + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + vsub.f64 d4, d4 , d4 + vsub.f64 d5, d5 , d5 + + FMAC_R1 d4 , d0 , d8 + FMAC_I1 d5 , d0 , d9 + FMAC_R2 d4 , d1 , d9 + FMAC_I2 d5 , d1 , d8 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, 
N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 
+ KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 
double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + 
ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + 
KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S new file mode 100644 index 000000000..917ce610f --- /dev/null +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -0,0 +1,1538 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/10/16 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACKSIZE 256 + +#define OLD_M r0 +#define OLD_N r1 +#define OLD_K r2 +#define OLD_A r3 +#define OLD_ALPHA_R d0 +#define OLD_ALPHA_I d1 + +/****************************************************** +* [fp, #-128] - [fp, #-64] is reserved +* for store and restore of floating point +* registers +*******************************************************/ + +#define KKK [fp, #-240] +#define KK [fp, #-244 ] +#define A [fp, #-248 ] +#define LDC [fp, #-252 ] +#define M [fp, #-256 ] +#define N [fp, #-260 ] +#define K [fp, #-264 ] + +#define ALPHA_I [fp, #-272] +#define ALPHA_R [fp, #-280] + +#define B [fp, #4 ] +#define C [fp, #8 ] +#define OLD_LDC [fp, #12 ] +#define OFFSET [fp, #16 ] + +#define I r0 +#define J r1 +#define L r2 + +#define AO r5 +#define BO r6 + +#define CO1 r8 +#define CO2 r9 + +#define K1 r7 +#define BC r12 + +#define A_PRE 96 +#define 
B_PRE 96 +#define C_PRE 64 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fnmacd + +#elif defined(CN) || defined(CT) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fmacd + +#elif defined(NC) || defined(TC) + + #define FADD_R faddd + #define FADD_I fsubd + + #define FMAC_R1 fmuld + #define FMAC_R2 fnmacd + #define FMAC_I1 fmuld + #define FMAC_I2 fmacd + +#else + + #define FADD_R fsubd + #define FADD_I faddd + + #define FMAC_R1 fnmuld + #define FMAC_R2 fmacd + #define FMAC_I1 fnmuld + #define FMAC_I2 fnmacd + +#endif + + + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro INIT2x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d22, d16 + vmov.f64 d23, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + vmov.f64 d30, d16 + vmov.f64 d31, d16 + +.endm + +.macro KERNEL2x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmuld d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmuld d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmuld d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmuld d18 , d2, d8 + add BO , BO, #32 + fmuld d26 , d3, d9 + add AO , AO, #32 + fmuld d19 , d2, d9 + pld [ BO , #B_PRE ] + fmuld d27 , d3, d8 + + pld [ AO , #A_PRE ] + fmuld d20 , d0, d10 + fldd d4 , [ AO, #0 ] + fmuld d28 , d1, d11 + fldd d5 , [ AO, #8 ] + fmuld d21 , d0, d11 + fldd d12, [ BO ] + fmuld d29 , d1, d10 + + fldd 
d13, [ BO, #8 ] + fmuld d22 , d2, d10 + fldd d6 , [ AO, #16 ] + fmuld d30 , d3, d11 + fldd d7 , [ AO, #24 ] + fmuld d23 , d2, d11 + fldd d14, [ BO, #16 ] + fmuld d31 , d3, d10 + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x2_M1 + pld [ AO , #A_PRE ] + + fmacd d16 , d0, d8 + pld [ BO , #B_PRE ] + fmacd d24 , d1, d9 + fldd d4 , [ AO, #0 ] + fmacd d17 , d0, d9 + fldd d5 , [ AO, #8 ] + fmacd d25 , d1, d8 + + fldd d12, [ BO ] + fmacd d18 , d2, d8 + fldd d13, [ BO, #8 ] + fmacd d26 , d3, d9 + fldd d6 , [ AO, #16 ] + fmacd d19 , d2, d9 + fldd d7 , [ AO, #24 ] + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fldd d14, [ BO, #16 ] + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fldd d15, [ BO, #24 ] + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + +.macro KERNEL2x2_M2 + pld [ AO , #A_PRE ] + + fmacd d16 , d4, d12 + pld [ BO , #B_PRE ] + fmacd d24 , d5, d13 + fldd d0 , [ AO, #0 ] + fmacd d17 , d4, d13 + fldd d1 , [ AO, #8 ] + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fldd d8 , [ BO ] + fmacd d26 , d7, d13 + fldd d9 , [ BO, #8 ] + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d2 , [ AO, #16 ] + fmacd d20 , d4, d14 + fldd d3 , [ AO, #24 ] + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fldd d10, [ BO, #16 ] + fmacd d29 , d5, d14 + + fldd d11, [ BO, #24 ] + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + add BO , BO, #32 + fmacd d23 , d6, d15 + add AO , AO, #32 + fmacd d31 , d7, d14 + +.endm + + +.macro KERNEL2x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fmacd d22 , d6, d14 + fmacd d30 , d7, d15 + fmacd d23 , d6, d15 + fmacd d31 , d7, d14 + +.endm + +.macro KERNEL2x2_SUB + + pld [ AO , #A_PRE ] 
+ pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fldd d2 , [ AO, #16 ] + fmacd d24 , d1, d9 + fldd d3 , [ AO, #24 ] + fmacd d17 , d0, d9 + fldd d10, [ BO, #16 ] + fmacd d25 , d1, d8 + + fldd d11, [ BO, #24 ] + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fmacd d22 , d2, d10 + add BO , BO, #32 + fmacd d30 , d3, d11 + fmacd d23 , d2, d11 + add AO , AO, #32 + fmacd d31 , d3, d10 + +.endm + + + + +.macro SAVE2x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + FADD_R d22, d30 , d22 + FADD_I d23, d31 , d23 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + FMAC_R1 d10, d0 , d22 + FMAC_I1 d11, d0 , d23 + FMAC_R2 d10, d1 , d23 + FMAC_I2 d11, d1 , d22 + + fstmiad CO1, { d4 - d7 } + fstmiad CO2, { d8 - d11 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x2 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d20, d16 + vmov.f64 d21, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d28, d16 + vmov.f64 d29, d16 + +.endm + +.macro KERNEL1x2_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d20 , d0, d10 + fmuld d28 , d1, d11 + fmuld d21 , 
d0, d11 + fmuld d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x2_M1 + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + fldd d14, [ BO, #16 ] + fldd d15, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + +.macro KERNEL1x2_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + add BO , BO, #32 + add AO , AO, #16 +.endm + + +.macro KERNEL1x2_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d20 , d4, d14 + fmacd d28 , d5, d15 + fmacd d21 , d4, d15 + fmacd d29 , d5, d14 + +.endm + +.macro KERNEL1x2_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + fldd d10, [ BO, #16 ] + fldd d11, [ BO, #24 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d20 , d0, d10 + fmacd d28 , d1, d11 + fmacd d21 , d0, d11 + fmacd d29 , d1, d10 + + add BO , BO, #32 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x2 + + ldr r3 , LDC + add CO2 , CO1, r3 + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d20, d28 , d20 + FADD_I d21, d29 , d21 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 
d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d8 , d0 , d20 + FMAC_I1 d9 , d0 , d21 + FMAC_R2 d8 , d1 , d21 + FMAC_I2 d9 , d1 , d20 + + fstmiad CO1, { d4 - d5 } + fstmiad CO2, { d8 - d9 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + +.macro INIT2x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d18, d16 + vmov.f64 d19, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + vmov.f64 d26, d16 + vmov.f64 d27, d16 + +.endm + +.macro KERNEL2x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + fmuld d18 , d2, d8 + fmuld d26 , d3, d9 + fmuld d19 , d2, d9 + fmuld d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + + + +.macro KERNEL2x1_M1 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + fldd d6 , [ AO, #16 ] + fldd d7 , [ AO, #24 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #32 +.endm + +.macro KERNEL2x1_M2 + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , 
BO, #16 + add AO , AO, #32 +.endm + + +.macro KERNEL2x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fmacd d18 , d6, d12 + fmacd d26 , d7, d13 + fmacd d19 , d6, d13 + fmacd d27 , d7, d12 + +.endm + +.macro KERNEL2x1_SUB + + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d2 , [ AO, #16 ] + fldd d3 , [ AO, #24 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fmacd d18 , d2, d8 + fmacd d26 , d3, d9 + fmacd d19 , d2, d9 + fmacd d27 , d3, d8 + + add BO , BO, #16 + add AO , AO, #32 + +.endm + + + + +.macro SAVE2x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + FADD_R d18, d26 , d18 + FADD_I d19, d27 , d19 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + FMAC_R1 d6 , d0 , d18 + FMAC_I1 d7 , d0 , d19 + FMAC_R2 d6 , d1 , d19 + FMAC_I2 d7 , d1 , d18 + + fstmiad CO1, { d4 - d7 } + + add CO1, CO1, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + + vsub.f64 d16 , d16 , d16 + vmov.f64 d17, d16 + vmov.f64 d24, d16 + vmov.f64 d25, d16 + +.endm + +.macro KERNEL1x1_I + pld [ AO , #A_PRE ] + pld [ BO , #B_PRE ] + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmuld d16 , d0, d8 + fmuld d24 , d1, d9 + fmuld d17 , d0, d9 + fmuld d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + + pld [ BO , #B_PRE ] + pld [ AO , #A_PRE ] + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + + +.macro KERNEL1x1_M1 + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + fldd d4 , [ AO, #0 ] + fldd d5 , [ AO, #8 ] + + fldd d12, [ BO ] + fldd d13, [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 
+.endm + +.macro KERNEL1x1_M2 + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + + fldd d0 , [ AO, #0 ] + fldd d1 , [ AO, #8 ] + + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + add BO , BO, #16 + add AO , AO, #16 +.endm + + +.macro KERNEL1x1_E + + fmacd d16 , d4, d12 + fmacd d24 , d5, d13 + fmacd d17 , d4, d13 + fmacd d25 , d5, d12 + +.endm + +.macro KERNEL1x1_SUB + + fldd d0 , [ AO ] + fldd d1 , [ AO, #8 ] + fldd d8 , [ BO ] + fldd d9 , [ BO, #8 ] + + fmacd d16 , d0, d8 + fmacd d24 , d1, d9 + fmacd d17 , d0, d9 + fmacd d25 , d1, d8 + + add BO , BO, #16 + add AO , AO, #16 + +.endm + + + + +.macro SAVE1x1 + + fldd d0, ALPHA_R + fldd d1, ALPHA_I + + FADD_R d16, d24 , d16 + FADD_I d17, d25 , d17 + + FMAC_R1 d4 , d0 , d16 + FMAC_I1 d5 , d0 , d17 + FMAC_R2 d4 , d1 , d17 + FMAC_I2 d5 , d1 , d16 + + fstmiad CO1, { d4 - d5 } + + add CO1, CO1, #16 + +.endm + +/******************************************************************************/ + + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + push {r4 - r9, fp} + add fp, sp, #24 + sub sp, sp, #STACKSIZE // reserve stack + + str OLD_M, M + str OLD_N, N + str OLD_K, K + str OLD_A, A + vstr OLD_ALPHA_R, ALPHA_R + vstr OLD_ALPHA_I, ALPHA_I + + sub r3, fp, #128 + vstm r3, { d8 - d15} // store floating point registers + + ldr r3, OLD_LDC + lsl r3, r3, #4 // ldc = ldc * 8 * 2 + str r3, LDC + + ldr r3, OFFSET +#ifndef LEFT + neg r3 , r3 +#endif + str r3 , KK + + ldr BC, B + + ldr J, N + asrs J, J, #1 // J = J / 2 + ble _L1_BEGIN + +_L2_BEGIN: + + ldr CO1, C // CO1 = C + ldr r4 , LDC + lsl r4 , r4 , #1 // LDC * 2 + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + pld [AO , #A_PRE-64] + pld [AO , #A_PRE-32] + + + +_L2_M2_BEGIN: + + 
ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L2_M1_BEGIN + +_L2_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L2_M2_30 + .align 5 + + + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + sub L, L, #2 + +_L2_M2_22: + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + subs L, L, #1 + bgt _L2_M2_22 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_30: + tst L, #3 + ble _L2_M2_40 + + tst L, #2 + ble _L2_M2_32 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + +_L2_M2_32: + + tst L, #1 + ble _L2_M2_40 + + KERNEL2x2_I + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_M2 + + KERNEL2x2_M1 + KERNEL2x2_M2 + KERNEL2x2_M1 + KERNEL2x2_E + + b _L2_M2_44 + + +_L2_M2_40: + + INIT2x2 + + +_L2_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L2_M2_100 + +_L2_M2_46: + + KERNEL2x2_SUB + + subs L, L, #1 + bne _L2_M2_46 + +_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) 
+ ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + + +_L2_M2_END: + + subs I, I, #1 + bne _L2_M2_20 + + +_L2_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L2_END + +_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #2 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L2_M1_40 + +_L2_M1_22: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_22 + + +_L2_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L2_M1_100 + +_L2_M1_42: + + KERNEL1x2_SUB + + subs L, L, #1 + bgt _L2_M1_42 + +_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L2_END: + + mov r3, BC + ldr r4, K + lsl r4, r4, #5 // k * 2 * 8 * 2 + add r3, r3, r4 // B = B + K * 4 * 8 + mov BC, r3 + +#if !defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // 
number of values in BO + str r3 , KK +#endif + + + subs J , #1 // j-- + bgt _L2_BEGIN + + + +/*********************************************************************************************/ + +_L1_BEGIN: + + ldr J , N + tst J , #1 + ble _L999 + + + ldr CO1, C // CO1 = C + ldr r4 , LDC + add r3 , r4, CO1 + str r3 , C // store C + +#if defined(LEFT) + ldr r3 , OFFSET + str r3 , KK +#endif + + + ldr AO, A // AO = A + +_L1_M2_BEGIN: + + ldr I, M + asrs I, I, #1 // I = I / 2 + ble _L1_M1_BEGIN + +_L1_M2_20: + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #2 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + cmp L , #3 + blt _L1_M2_30 + .align 5 + + + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + sub L, L, #2 + +_L1_M2_22: + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + subs L, L, #1 + bgt _L1_M2_22 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_30: + tst L, #3 + ble _L1_M2_40 + + tst L, #2 + ble _L1_M2_32 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + +_L1_M2_32: + + tst 
L, #1 + ble _L1_M2_40 + + KERNEL2x1_I + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_M2 + + KERNEL2x1_M1 + KERNEL2x1_M2 + KERNEL2x1_M1 + KERNEL2x1_E + + b _L1_M2_44 + + +_L1_M2_40: + + INIT2x1 + + +_L1_M2_44: + + ands L , K1, #7 // L = L % 8 + ble _L1_M2_100 + +_L1_M2_46: + + KERNEL2x1_SUB + + subs L, L, #1 + bne _L1_M2_46 + +_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #5 // 2 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #2 // number of values in AO + str r3 , KK +#endif + + + +_L1_M2_END: + + subs I, I, #1 + bne _L1_M2_20 + + +_L1_M1_BEGIN: + + ldr I, M + tst I, #1 // I = I % 2 + ble _L1_END + +_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + + mov BO, BC +#else + mov BO, BC + ldr r3 , KK + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 + +#endif + +#ifndef TRMMKERNEL + ldr K1, K +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + ldr K1, K + ldr r3, KK + sub K1, K1, r3 + str K1, KKK +#else + ldr K1, KK +#ifdef LEFT + add K1, K1, #1 // number of values in AO +#else + add K1, K1, #1 // number of values in BO +#endif + str K1, KKK +#endif + + asrs L , K1, #3 // L = L / 8 + ble _L1_M1_40 + +_L1_M1_22: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_22 + + +_L1_M1_40: + + ands L , K1, #7 // L = L % 8 + ble _L1_M1_100 + +_L1_M1_42: + + KERNEL1x1_SUB + + subs L, L, #1 + bgt _L1_M1_42 + +_L1_M1_100: + + SAVE1x1 + + +#if (defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + ldr r3 , K + ldr r4 , KKK + sub r3 , r3 , r4 
+ lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add BO , BO , r4 + lsls r4 , r3 , #4 // 1 * 8 * 2 double values + add AO , AO , r4 +#endif + +#if defined(LEFT) + ldr r3 , KK + add r3 , r3 , #1 // number of values in AO + str r3 , KK +#endif + + + +_L1_END: + + + +_L999: + + sub r3, fp, #128 + vldm r3, { d8 - d15} // restore floating point registers + + movs r0, #0 // set return value + sub sp, fp, #24 + pop {r4 - r9, fp} + bx lr + + EPILOGUE + diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL new file mode 100644 index 000000000..aeccfbf4c --- /dev/null +++ b/kernel/arm64/KERNEL @@ -0,0 +1,46 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2.c +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.c +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.c +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.c +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif + + diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 new file mode 100644 index 000000000..ecf278cf9 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8 @@ -0,0 +1,134 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + 
+ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ 
= dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + diff --git a/kernel/arm64/Makefile b/kernel/arm64/Makefile new file mode 100644 index 000000000..efae70d7b --- /dev/null +++ b/kernel/arm64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/lapack/laswp/arm/Makefile b/lapack/laswp/arm/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. 
+include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/lapack/laswp/arm64/Makefile b/lapack/laswp/arm64/Makefile new file mode 100644 index 000000000..434c82a84 --- /dev/null +++ b/lapack/laswp/arm64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(DYNAMIC_ARCH), 1) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile +