Merge pull request #323 from wernsaar/develop

Merge bulldozer, haswell, piledriver and armv7 branches
This commit is contained in:
Zhang Xianyi 2013-12-03 06:47:03 -08:00
commit ea74f331f4
158 changed files with 75877 additions and 240 deletions

12
Makefile.arm Normal file
View File

@ -0,0 +1,12 @@
# Per-core compiler flags for 32-bit ARM builds. The same flag set is appended
# to both the C (CCOMMON_OPT) and Fortran (FCOMMON_OPT) option lists.
# ARMV7: ARM instruction encoding, VFPv3 FPU, hard-float ABI, ARMv7-A ISA.
ifeq ($(CORE), ARMV7)
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
# ARMV6: ARM instruction encoding, baseline VFP FPU, hard-float ABI, ARMv6 ISA.
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
endif

7
Makefile.arm64 Normal file
View File

@ -0,0 +1,7 @@
# Per-core compiler flags for 64-bit ARM builds: an ARMV8 core selects the
# ARMv8-A ISA for both the C and Fortran compilers.
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a
endif

View File

@ -336,14 +336,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif endif
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif endif
endif endif
@ -373,6 +373,19 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1 BINARY_DEFINED = 1
endif endif
# Both ARM architectures (arm and arm64) disable binary-mode selection and
# mark the binary type as already decided.
ifneq ($(filter arm arm64,$(ARCH)),)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
# #
# C Compiler dependent settings # C Compiler dependent settings
# #
@ -833,6 +846,19 @@ ifeq ($(DEBUG), 1)
COMMON_OPT += -g COMMON_OPT += -g
endif endif
# ARM builds (arm and arm64) default to -O3 when COMMON_OPT is still unset
# at this point.
ifndef COMMON_OPT
ifneq ($(filter arm arm64,$(ARCH)),)
COMMON_OPT = -O3
endif
endif
ifndef COMMON_OPT ifndef COMMON_OPT
COMMON_OPT = -O2 COMMON_OPT = -O2
endif endif
@ -958,6 +984,10 @@ export HAVE_SSE4_2
export HAVE_SSE4A export HAVE_SSE4A
export HAVE_SSE5 export HAVE_SSE5
export HAVE_AVX export HAVE_AVX
export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
export HAVE_NEON
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE

View File

@ -63,6 +63,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = ia64 if ($data =~ /ARCH_IA64/);
# NOTE: /ARCH_ARM/ also matches the text "ARCH_ARM64", so the arm64 test has
# to stay after the arm test for its assignment to take precedence.
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$defined = 0; $defined = 0;
@ -149,6 +151,8 @@ $architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/); $architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/); $architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = ia64 if ($data =~ /ARCH_IA64/);
# NOTE: /ARCH_ARM/ also matches the text "ARCH_ARM64", so the arm64 test has
# to stay after the arm test for its assignment to take precedence.
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$binformat = bin32; $binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/); $binformat = bin64 if ($data =~ /BINARY_64/);

303
cblas_noconst.h Normal file
View File

@ -0,0 +1,303 @@
#ifndef CBLAS_H
#define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
/* Set the number of threads at run time.  Two entry points with the same
   signature are declared: the OpenBLAS name and the goto_ name. */
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/* Get the build configuration at run time, as a C string. */
char* openblas_get_config(void);
/* Get the parallelization type used by this OpenBLAS build; the
   OPENBLAS_* constants below name the possible types. */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using the normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using the OpenMP threading model */
#define OPENBLAS_OPENMP 2
/* Return type of the cblas_i?amax index functions declared in this header. */
#define CBLAS_INDEX size_t
/* Selector for row-major versus column-major matrix layout. */
typedef enum CBLAS_ORDER {
  CblasRowMajor = 101,
  CblasColMajor = 102
} CBLAS_ORDER;

/* Selector for the transposition / conjugation applied to a matrix operand. */
typedef enum CBLAS_TRANSPOSE {
  CblasNoTrans     = 111,
  CblasTrans       = 112,
  CblasConjTrans   = 113,
  CblasConjNoTrans = 114
} CBLAS_TRANSPOSE;

/* Selector for the upper or lower triangle of a matrix. */
typedef enum CBLAS_UPLO {
  CblasUpper = 121,
  CblasLower = 122
} CBLAS_UPLO;

/* Selector for unit versus non-unit diagonal. */
typedef enum CBLAS_DIAG {
  CblasNonUnit = 131,
  CblasUnit    = 132
} CBLAS_DIAG;

/* Selector for the left or right side in two-sided operations. */
typedef enum CBLAS_SIDE {
  CblasLeft  = 141,
  CblasRight = 142
} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy);
void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
float *Y, blasint incY, float *A, blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
double *Y, blasint incY, double *A, blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

View File

@ -310,6 +310,15 @@ typedef int blasint;
#define YIELDING SwitchToThread() #define YIELDING SwitchToThread()
#endif #endif
/* Busy-wait form of YIELDING for ARM cores and for PILEDRIVER: a short run
   of nop instructions instead of a scheduler call.
   The expansion deliberately carries NO trailing semicolon, like the
   sched_yield() / SwitchToThread() variants, so that `YIELDING;` is exactly
   one statement on every platform (a trailing `;` in the macro would turn it
   into two statements and break `if (...) YIELDING; else ...`).
   __asm__/__volatile__ are used instead of plain `asm` so the macro also
   works in strict ISO C modes. */
#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8)
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n")
#endif
#ifdef PILEDRIVER
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n")
#endif
#ifndef YIELDING #ifndef YIELDING
#define YIELDING sched_yield() #define YIELDING sched_yield()
#endif #endif
@ -363,6 +372,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_mips64.h" #include "common_mips64.h"
#endif #endif
#ifdef ARCH_ARM
#include "common_arm.h"
#endif
#ifdef ARCH_ARM64
#include "common_arm64.h"
#endif
#ifdef OS_LINUX #ifdef OS_LINUX
#include "common_linux.h" #include "common_linux.h"
#endif #endif

169
common_arm.h Normal file
View File

@ -0,0 +1,169 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science, ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_ARM
#define COMMON_ARM
/* Memory barriers.
   On ARMV7 a real hardware barrier is emitted: `dmb ish` for MB (full
   barrier) and `dmb ishst` for WMB (store barrier); the "memory" clobber also
   stops the compiler from reordering accesses across the macro.
   Other cores keep the previous empty definitions because the dmb
   instruction is not available before ARMv7.
   Both macros are meant to be used as statements: `MB;` / `WMB;`. */
#if defined(ARMV7)
#define MB  __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#else
#define MB
#define WMB
#endif
/* Spelling of the inline keyword used through the INLINE macro. */
#define INLINE inline
/* Defined (empty) for ARM; NOTE(review): presumably tells the interface code
   that complex results are returned by value -- confirm against its users. */
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
/*
 * Acquire the spin lock stored at *address (zero = free, non-zero = held).
 *
 * The function first spins with plain reads (calling YIELDING) while the
 * lock word is non-zero, then tries to claim it with an exclusive
 * load/store pair:
 *   ldrex  reads the current lock value,
 *   strex  attempts to write 1 and reports success (0) or failure (1),
 *   orr    merges the value that was read into that status.
 * The attempt is repeated while the merged result is non-zero, i.e. when the
 * exclusive store failed or another thread already held the lock at the
 * time of the load.  Writing 1 and checking the loaded value are both
 * required: storing 0 would leave the lock word clear and give no mutual
 * exclusion at all.
 *
 * NOTE(review): no hardware barrier is issued after the lock is taken; only
 * the "memory" clobber orders the compiler.  Confirm whether callers rely on
 * MB for that.
 */
static void __inline blas_lock(volatile BLASULONG *address){
  int register ret;

  do {
    /* Read-only wait so the exclusive monitor is only used when the lock
       looks free. */
    while (*address) {YIELDING;};

    __asm__ __volatile__(
      "ldrex r2, [%1]        \n\t"
      "strex %0, %2, [%1]    \n\t"
      "orr   %0, %0, r2      \n\t"
      : "=&r"(ret)                 /* early-clobber: must differ from inputs */
      : "r"(address), "r"(1)
      : "memory", "r2"
    );
  } while (ret);
}
/*
 * Return the current time of day in milliseconds.
 *
 * The value is built from gettimeofday(): seconds plus microseconds * 1e-6,
 * multiplied by 1000 and truncated to an unsigned 64-bit integer.
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;

  /* Plain double constant: a `d` suffix on a floating literal is not
     standard C and is rejected by some compilers. */
  return (unsigned long long) (seconds * 1000.0);
}
/* Integer quotient of x by y using the plain C division operator; the result
   is converted to int on return.  y must not be zero. */
static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
}
/* GET_IMAGE(res): store a VFP register into the lvalue res -- d1 with a
   64-bit store when DOUBLE is defined, s1 with a 32-bit store otherwise.
   NOTE(review): presumably this fetches the imaginary part of a complex
   result left in registers by a kernel -- confirm against the callers. */
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
#endif
/* Expands to nothing on this architecture. */
#define GET_IMAGE_CANCEL
#endif
/* REALNAME is the symbol name used for the routine being built: ASMNAME by
   default, ASMFNAME when F_INTERFACE is defined. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
/* Assembly-source helpers, only available when ASSEMBLER is defined and
   NEEDPARAM is not.
   PROLOGUE selects ARM instruction encoding (.arm), exports REALNAME as a
   global symbol, opens a .func region for it and places the entry label.
   EPILOGUE and PROFCODE expand to nothing.
   NOTE(review): since EPILOGUE is empty, nothing here emits a matching
   .endfunc -- confirm whether that is intended. */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:
#define EPILOGUE
#define PROFCODE
#endif
/* Defined (empty) for this architecture; NOTE(review): the code that tests
   SEEK_ADDRESS lives outside this header. */
#define SEEK_ADDRESS
/* Default page size, 4 << 10 = 4096 bytes, unless PAGESIZE is already
   defined. */
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
/* 4 << 20 = 4 MiB. */
#define HUGE_PAGESIZE ( 4 << 20)
/* 16 << 20 = 16 MiB. */
#define BUFFER_SIZE (16 << 20)
/* START_ADDRESS lowered by one BUFFER_SIZE per CPU slot (MAX_CPU_NUMBER);
   both of those names are defined outside this header. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
/* Alias MAP_ANONYMOUS to MAP_ANON when the former is not defined. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

169
common_arm64.h Normal file
View File

@ -0,0 +1,169 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* Include guard for the 64-bit ARM platform header. */
#ifndef COMMON_ARM64
#define COMMON_ARM64
/* MB and WMB expand to nothing on this platform.
   NOTE(review): presumably these are memory-barrier hooks used by shared
   code — confirm against the callers. */
#define MB
#define WMB
#define INLINE inline
/* Defined with no value; it is only tested by code outside this header. */
#define RETURN_BY_COMPLEX
/* The section opened here is C-only: it is skipped when ASSEMBLER is defined. */
#ifndef ASSEMBLER
static void __inline blas_lock(volatile BLASULONG *address){
/*
int register ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"ldrex r2, [%1] \n\t"
"mov r2, #0 \n\t"
"strex r3, r2, [%1] \n\t"
"mov %0 , r3 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "r2" , "r3"
);
} while (ret);
*/
}
/*
 * Return the current time of day, as reported by gettimeofday(), expressed
 * in milliseconds.  Seconds and microseconds are combined in a double,
 * scaled by 1000 and truncated to an integer.
 */
static inline unsigned long long rpcc(void){
  struct timeval tv;
  double seconds;

  gettimeofday(&tv, NULL);
  seconds = (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
  /* Plain 1000.0: the former "1000.0d" relied on a GNU-only literal suffix
     that other compilers reject; an unsuffixed constant is already double. */
  return (unsigned long long) (seconds * 1000.0);
}
/*
 * Integer quotient of x by y.  No shortcut is taken on this platform: the
 * operands go straight to the C division operator, so y must be non-zero.
 */
static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
}
/*
 * GET_IMAGE(res): store floating-point register d1 (when DOUBLE is
 * defined) or s1 (otherwise) into the lvalue res.
 *
 * The A64 store mnemonic is "str"; the "vstr.f64"/"vstr.f32" forms used
 * previously belong to the 32-bit VFP instruction set and do not assemble
 * for AArch64.
 * NOTE(review): presumably this retrieves the imaginary part of a complex
 * result that was returned in registers — confirm against the callers.
 */
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
#endif
/* GET_IMAGE_CANCEL expands to nothing on this platform. */
#define GET_IMAGE_CANCEL
#endif
/* REALNAME is the label that PROLOGUE defines: ASMNAME by default, ASMFNAME
   when F_INTERFACE is defined.  Both names are supplied from outside this
   header. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
/*
 * Helpers for assembly sources (skipped when NEEDPARAM is defined).
 * PROLOGUE exports REALNAME, opens a .func record for it and places the
 * entry label.  EPILOGUE and PROFCODE expand to nothing.
 *
 * The ".arm" directive carried over from the 32-bit header was removed:
 * it selects the A32 instruction set and is not a valid directive when
 * assembling for AArch64.
 */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.global REALNAME ;\
.func REALNAME ;\
REALNAME:
#define EPILOGUE
#define PROFCODE
#endif
/* Defined with no value; only tested by code outside this header. */
#define SEEK_ADDRESS
/* Page size defaults to 4 KiB unless PAGESIZE was already defined. */
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
/* 4 MiB. */
#define HUGE_PAGESIZE ( 4 << 20)
/* 16 MiB. */
#define BUFFER_SIZE (16 << 20)
/* START_ADDRESS lowered by one BUFFER_SIZE per possible CPU; START_ADDRESS
   and MAX_CPU_NUMBER are defined elsewhere. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
/* Fall back to the MAP_ANON spelling where MAP_ANONYMOUS is not provided. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
/* End of the COMMON_ARM64 include guard. */
#endif

View File

@ -107,7 +107,7 @@
#define CORE_BOBCAT 21 #define CORE_BOBCAT 21
#define CORE_BULLDOZER 22 #define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23 #define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE #define CORE_HASWELL 24
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -200,7 +200,6 @@ typedef struct {
#define CPUTYPE_BOBCAT 45 #define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46 #define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47 #define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet #define CPUTYPE_HASWELL 48
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
#endif #endif

262
cpuid_arm.c Normal file
View File

@ -0,0 +1,262 @@
/**************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <string.h>
/*
 * Core identifiers returned by detect().  Each value is also the index of
 * that core's entry in cpuname[], so the two lists must stay in the same
 * order.
 */
#define CPU_UNKNOWN 0
#define CPU_ARMV6 1
#define CPU_ARMV7 2
#define CPU_CORTEXA15 3

/* Printable core names, indexed by the CPU_* identifiers. */
static char *cpuname[] = {
	"UNKNOWN",	/* CPU_UNKNOWN; previously misspelled "UNKOWN" */
	"ARMV6",	/* CPU_ARMV6 */
	"ARMV7",	/* CPU_ARMV7 */
	"CORTEXA15"	/* CPU_CORTEXA15 */
};
/*
 * Report whether the "Features" line of /proc/cpuinfo lists a given flag.
 *
 * search: flag name to look for (compared with strcmp, so it must match a
 *         whole whitespace-separated word exactly).
 *
 * Returns 1 when the flag is listed.  Returns 0 when it is not listed, when
 * search is NULL, when /proc/cpuinfo cannot be opened or has no usable
 * "Features" line, and on builds where the `linux` macro is not defined.
 *
 * Uses strtok(), so it is not reentrant.
 */
int get_feature(char *search)
{
#ifdef linux
	FILE *infile;
	char buffer[2048], *p, *t;

	if (search == NULL) return(0);

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return(0);

	p = (char *) NULL;
	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Stays NULL when the line has no ':' separator. */
			p = strchr(buffer, ':');
			break;
		}
	}
	fclose(infile);

	if (p == NULL) return(0);

	/*
	 * Walk every word after the ':'.  The first word is examined as well,
	 * and '\n' is a delimiter so the last word on the line does not carry
	 * the trailing newline left by fgets().
	 */
	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, search)) return(1);
	}
#endif
	return(0);
}
/*
 * Classify the host CPU from /proc/cpuinfo.
 *
 * The "model name" line is searched for "ARMv7" or "ARMv6", and the result
 * is refined with the floating-point features reported by get_feature():
 *   ARMv7 with vfpv4 or vfpv3 -> CPU_ARMV7
 *   ARMv7 with only vfp       -> CPU_ARMV6
 *   ARMv6 with vfp            -> CPU_ARMV6
 *
 * Returns CPU_UNKNOWN in every other case, including when /proc/cpuinfo
 * cannot be opened and on builds where the `linux` macro is not defined.
 */
int detect(void)
{
#ifdef linux
	FILE *infile;
	char buffer[512], *p;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return CPU_UNKNOWN;

	p = (char *) NULL;
	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("model name", buffer, 10))
		{
			/* Stays NULL when the line has no ':' separator.  The
			   pointer is left on the ':' itself: stepping past it
			   blindly could run beyond the end of a short line, and
			   the substring searches below are unaffected. */
			p = strchr(buffer, ':');
			break;
		}
	}
	fclose(infile);

	if (p != NULL)
	{
		if (strstr(p, "ARMv7"))
		{
			if (get_feature("vfpv4"))
				return CPU_ARMV7;
			if (get_feature("vfpv3"))
				return CPU_ARMV7;
			/* Only the basic VFP unit: use the ARMV6 configuration. */
			if (get_feature("vfp"))
				return CPU_ARMV6;
		}
		if (strstr(p, "ARMv6"))
		{
			if (get_feature("vfp"))
				return CPU_ARMV6;
		}
	}
#endif
	return CPU_UNKNOWN;
}
/* Name of the detected core: the cpuname[] entry selected by detect(). */
char *get_corename(void)
{
	int core = detect();

	return cpuname[core];
}
/* Write the architecture name, "ARM", to standard output (no newline). */
void get_architecture(void)
{
	fputs("ARM", stdout);
}
/*
 * Write the sub-architecture of the detected core to standard output
 * (no newline): "ARMV7", "ARMV6", or "UNKNOWN" for anything else.
 */
void get_subarchitecture(void)
{
	int core = detect();

	if (core == CPU_ARMV7)
		printf("ARMV7");
	else if (core == CPU_ARMV6)
		printf("ARMV6");
	else
		printf("UNKNOWN");
}
/* Write the sub-directory name, "arm", to standard output (no newline). */
void get_subdirname(void)
{
	fputs("arm", stdout);
}
/*
 * Print the "#define" lines describing the detected core to standard
 * output: first the core and floating-point unit macros, then the cache
 * and TLB parameters, which are the same for ARMV7 and ARMV6.  Nothing is
 * printed for any other detect() result.
 */
void get_cpuconfig(void)
{
	int core = detect();

	if (core == CPU_ARMV7)
	{
		printf("#define ARMV7\n");
		printf("#define HAVE_VFP\n");
		printf("#define HAVE_VFPV3\n");
		/* Optional units are advertised only when get_feature() reports them. */
		if (get_feature("neon"))
			printf("#define HAVE_NEON\n");
		if (get_feature("vfpv4"))
			printf("#define HAVE_VFPV4\n");
	}
	else if (core == CPU_ARMV6)
	{
		printf("#define ARMV6\n");
		printf("#define HAVE_VFP\n");
	}
	else
	{
		return;
	}

	printf("#define L1_DATA_SIZE 65536\n");
	printf("#define L1_DATA_LINESIZE 32\n");
	/* NOTE(review): 512488 is not a power of two (512 KiB is 524288) —
	   confirm the intended value. */
	printf("#define L2_SIZE 512488\n");
	printf("#define L2_LINESIZE 32\n");
	printf("#define DTB_DEFAULT_ENTRIES 64\n");
	printf("#define DTB_SIZE 4096\n");
	printf("#define L2_ASSOCIATIVE 4\n");
}
/*
 * Print the library name for the detected core followed by a newline:
 * "armv7" or "armv6".  Nothing is printed for any other detect() result.
 */
void get_libname(void)
{
	int core = detect();

	if (core == CPU_ARMV7)
		printf("armv7\n");
	else if (core == CPU_ARMV6)
		printf("armv6\n");
}
/*
 * Print one "HAVE_<UNIT>=1" line for each of vfp, vfpv3, vfpv4 and neon
 * that appears on the "Features" line of /proc/cpuinfo.
 *
 * Prints nothing when /proc/cpuinfo cannot be opened or has no usable
 * "Features" line, and on builds where the `linux` macro is not defined.
 *
 * Uses strtok(), so it is not reentrant.
 */
void get_features(void)
{
#ifdef linux
	FILE *infile;
	char buffer[2048], *p, *t;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL) return;

	p = (char *) NULL;
	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("Features", buffer, 8))
		{
			/* Stays NULL when the line has no ':' separator. */
			p = strchr(buffer, ':');
			break;
		}
	}
	fclose(infile);

	if (p == NULL) return;

	/*
	 * Walk every word after the ':'.  The first word is examined as well,
	 * and '\n' is a delimiter so the last word on the line does not carry
	 * the trailing newline left by fgets().
	 */
	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
	{
		if (!strcmp(t, "vfp")) { printf("HAVE_VFP=1\n"); continue; }
		if (!strcmp(t, "vfpv3")) { printf("HAVE_VFPV3=1\n"); continue; }
		if (!strcmp(t, "vfpv4")) { printf("HAVE_VFPV4=1\n"); continue; }
		if (!strcmp(t, "neon")) { printf("HAVE_NEON=1\n"); continue; }
	}
#endif
	return;
}

View File

@ -1243,6 +1243,7 @@ static char *cpuname[] = {
"BOBCAT", "BOBCAT",
"BULLDOZER", "BULLDOZER",
"PILEDRIVER", "PILEDRIVER",
"HASWELL",
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1293,6 +1294,7 @@ static char *lowercpuname[] = {
"bobcat", "bobcat",
"bulldozer", "bulldozer",
"piledriver", "piledriver",
"haswell",
}; };
static char *corename[] = { static char *corename[] = {
@ -1320,6 +1322,7 @@ static char *corename[] = {
"BOBCAT", "BOBCAT",
"BULLDOZER", "BULLDOZER",
"PILEDRIVER", "PILEDRIVER",
"HASWELL",
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1347,6 +1350,7 @@ static char *corename_lower[] = {
"bobcat", "bobcat",
"bulldozer", "bulldozer",
"piledriver", "piledriver",
"haswell",
}; };

View File

@ -124,3 +124,12 @@ ARCH_IA64
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
BINARY_64 BINARY_64
#endif #endif
/*
 * Emit the token ARCH_ARM when the compiler predefines __ARM_ARCH or
 * __ARM_ARCH_7A__, and ARCH_ARM64 when it predefines __aarch64__.
 * NOTE(review): an AArch64 compiler may predefine __ARM_ARCH as well, in
 * which case both tokens are emitted — confirm that whatever reads this
 * output copes with that.
 */
#if defined(__ARM_ARCH) || defined(__ARM_ARCH_7A__)
ARCH_ARM
#endif
#if defined(__aarch64__)
ARCH_ARM64
#endif

View File

@ -333,9 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) #if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;

View File

@ -367,9 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX) #if ( defined(BULLDOZER) || defined(PILEDRIVER) || defined(HASWELL) ) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;

View File

@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT;
extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER; extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_HASWELL;
#else #else
//Use NEHALEM kernels for sandy bridge //Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif #endif
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define VENDOR_INTEL 1 #define VENDOR_INTEL 1
#define VENDOR_AMD 2 #define VENDOR_AMD 2
@ -297,6 +298,7 @@ static char *corename[] = {
"Bobcat", "Bobcat",
"Bulldozer", "Bulldozer",
"Piledriver", "Piledriver",
"Haswell",
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -319,7 +321,8 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
return corename[0]; return corename[0];
} }

View File

@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "SANDYBRIDGE" #define CORENAME "SANDYBRIDGE"
#endif #endif
/*
 * Forced target: Haswell.  When FORCE_HASWELL is defined this block sets
 * FORCE and FORCE_INTEL and supplies the architecture, sub-architecture,
 * ARCHCONFIG flag string, LIBNAME and CORENAME strings for that target.
 */
#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#ifdef FORCE_ATOM #ifdef FORCE_ATOM
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL
@ -679,6 +694,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "generic" #define CORENAME "generic"
#endif #endif
/*
 * Forced target: ARMV7.  When FORCE_ARMV7 is defined this block sets FORCE
 * and supplies the architecture, sub-architecture, sub-directory,
 * ARCHCONFIG flag string, LIBNAME and CORENAME strings for that target.
 * The #else branch is empty.
 * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) —
 * confirm the intended value.
 */
#ifdef FORCE_ARMV7
#define FORCE
#define ARCHITECTURE "ARM"
#define SUBARCHITECTURE "ARMV7"
#define SUBDIRNAME "arm"
#define ARCHCONFIG "-DARMV7 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
"-DHAVE_VFPV3 -DHAVE_VFP"
#define LIBNAME "armv7"
#define CORENAME "ARMV7"
#else
#endif
/*
 * Forced target: ARMV6.  When FORCE_ARMV6 is defined this block sets FORCE
 * and supplies the architecture, sub-architecture, sub-directory,
 * ARCHCONFIG flag string, LIBNAME and CORENAME strings for that target.
 * The #else branch is empty.
 * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) —
 * confirm the intended value.
 */
#ifdef FORCE_ARMV6
#define FORCE
#define ARCHITECTURE "ARM"
#define SUBARCHITECTURE "ARMV6"
#define SUBDIRNAME "arm"
#define ARCHCONFIG "-DARMV6 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
"-DHAVE_VFP"
#define LIBNAME "armv6"
#define CORENAME "ARMV6"
#else
#endif
/*
 * Forced target: ARMV8.  When FORCE_ARMV8 is defined this block sets FORCE
 * and supplies the architecture ("ARM64"), sub-architecture, sub-directory,
 * ARCHCONFIG flag string, LIBNAME and CORENAME strings for that target.
 * The #else branch is empty.
 * NOTE(review): L2_SIZE=512488 is not a power of two (512 KiB is 524288) —
 * confirm the intended value.
 */
#ifdef FORCE_ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DARMV8 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
"-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
#define LIBNAME "armv8"
#define CORENAME "ARMV8"
#else
#endif
#ifndef FORCE #ifndef FORCE
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@ -719,6 +780,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED #define OPENBLAS_SUPPORTED
#endif #endif
#ifdef __arm__
#include "cpuid_arm.c"
#define OPENBLAS_SUPPORTED
#endif
#ifndef OPENBLAS_SUPPORTED #ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS." #error "This arch/CPU is not supported by OpenBLAS."
#endif #endif
@ -773,7 +840,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("CORE=%s\n", CORENAME); printf("CORE=%s\n", CORENAME);
#else #else
#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) #if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
printf("CORE=%s\n", get_corename()); printf("CORE=%s\n", get_corename());
#endif #endif
#endif #endif
@ -788,6 +855,11 @@ int main(int argc, char *argv[]){
printf("NUM_CORES=%d\n", get_num_cores()); printf("NUM_CORES=%d\n", get_num_cores());
#if defined(__arm__) && !defined(FORCE)
get_features();
#endif
#if defined(__i386__) || defined(__x86_64__) #if defined(__i386__) || defined(__x86_64__)
#ifndef FORCE #ifndef FORCE
get_sse(); get_sse();

View File

@ -14,6 +14,20 @@ ifeq ($(ARCH), MIPS)
USE_GEMM3M = 1 USE_GEMM3M = 1
endif endif
ifeq ($(ARCH), arm)
USE_TRMM = 1
endif
ifeq ($(ARCH), arm64)
USE_TRMM = 1
endif
ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
SKERNELOBJS += \ SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@ -498,7 +512,8 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
ifeq ($(TARGET), LOONGSON3B)
ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -582,24 +597,6 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
ifdef STRMMKERNEL
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -613,79 +610,17 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
else
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
endif
ifdef QTRMMKERNEL
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -699,50 +634,6 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef CTRMMKERNEL
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -767,37 +658,6 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
ifdef ZTRMMKERNEL
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -821,37 +681,10 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
endif endif
ifdef XTRMMKERNEL
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@ -877,9 +710,6 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@

46
kernel/arm/KERNEL Normal file
View File

@ -0,0 +1,46 @@
# Default kernel source files for the arm kernel directory.
# Every entry is wrapped in ifndef, so a file is assigned only when the
# variable has no non-empty value yet; a value set earlier is kept.
# NOTE(review): presumably the core-specific KERNEL.<CORE> file is read before
# this one - confirm in the makefile that includes it.

# NRM2 kernels: S/D share nrm2.c, C/Z share znrm2.c.
ifndef SNRM2KERNEL
SNRM2KERNEL = nrm2.c
endif
ifndef DNRM2KERNEL
DNRM2KERNEL = nrm2.c
endif
ifndef CNRM2KERNEL
CNRM2KERNEL = znrm2.c
endif
ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.c
endif
# CABS helpers: all three precisions use the generic C implementation.
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif
ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif
ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif
# LSAME helper from the generic directory.
ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif
# GEMM beta kernels: S/D use gemm_beta.c, C/Z use zgemm_beta.c (both generic).
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif

142
kernel/arm/KERNEL.ARMV6 Normal file
View File

@ -0,0 +1,142 @@
# Kernel source selection for the ARMV6 core.
# Files ending in _vfp.S are VFP assembly kernels; paths under ../generic are
# portable C implementations.

# AMAX/AMIN/MAX/MIN and their index-returning I* variants all map to the single
# source iamax_vfp.S.
# NOTE(review): presumably the build selects the variant through preprocessor
# defines - confirm in the kernel makefiles.
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
# ASUM and AXPY: one shared source per routine for all four precisions.
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
# COPY and DOT: a dedicated source file per precision.
SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
# NRM2, ROT, SCAL and SWAP: one shared source per routine.
SNRM2KERNEL = nrm2_vfp.S
DNRM2KERNEL = nrm2_vfp.S
CNRM2KERNEL = nrm2_vfp.S
ZNRM2KERNEL = nrm2_vfp.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
# GEMV (N and T): real precisions share a source, complex ones have their own.
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
# TRMM kernels; file names indicate 4x2 blocking for real and 2x2 for complex.
STRMMKERNEL = strmm_kernel_4x2_vfp.S
DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S
# SGEMM: 4x2 kernel with separate inner (I*, width 4) and outer (O*, width 2)
# copy routines; the outer transposed copy is the generic C version.
SGEMMKERNEL = sgemm_kernel_4x2_vfp.S
SGEMMINCOPY = sgemm_ncopy_4_vfp.S
SGEMMITCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPY = sgemm_ncopy_2_vfp.S
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
# DGEMM: same layout as SGEMM above.
DGEMMKERNEL = dgemm_kernel_4x2_vfp.S
DGEMMINCOPY = dgemm_ncopy_4_vfp.S
DGEMMITCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPY = dgemm_ncopy_2_vfp.S
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
# CGEMM and ZGEMM: 2x2 kernels; only the outer copy routines are set here.
CGEMMKERNEL = cgemm_kernel_2x2_vfp.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels: every precision and side uses the generic C implementation.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

141
kernel/arm/KERNEL.ARMV7 Normal file
View File

@ -0,0 +1,141 @@
# Kernel source selection for the ARMV7 core.
# Level-1 and GEMV routines use the *_vfp.S sources; NRM2, TRMM and the GEMM
# kernels use *_vfpv3.S sources; TRSM uses the generic C implementation.

# AMAX/AMIN/MAX/MIN and their index-returning I* variants all map to the single
# source iamax_vfp.S.
# NOTE(review): presumably the build selects the variant through preprocessor
# defines - confirm in the kernel makefiles.
SAMAXKERNEL = iamax_vfp.S
DAMAXKERNEL = iamax_vfp.S
CAMAXKERNEL = iamax_vfp.S
ZAMAXKERNEL = iamax_vfp.S
SAMINKERNEL = iamax_vfp.S
DAMINKERNEL = iamax_vfp.S
CAMINKERNEL = iamax_vfp.S
ZAMINKERNEL = iamax_vfp.S
SMAXKERNEL = iamax_vfp.S
DMAXKERNEL = iamax_vfp.S
SMINKERNEL = iamax_vfp.S
DMINKERNEL = iamax_vfp.S
ISAMAXKERNEL = iamax_vfp.S
IDAMAXKERNEL = iamax_vfp.S
ICAMAXKERNEL = iamax_vfp.S
IZAMAXKERNEL = iamax_vfp.S
ISAMINKERNEL = iamax_vfp.S
IDAMINKERNEL = iamax_vfp.S
ICAMINKERNEL = iamax_vfp.S
IZAMINKERNEL = iamax_vfp.S
ISMAXKERNEL = iamax_vfp.S
IDMAXKERNEL = iamax_vfp.S
ISMINKERNEL = iamax_vfp.S
IDMINKERNEL = iamax_vfp.S
# SWAP, ASUM and AXPY: one shared source per routine for all four precisions.
SSWAPKERNEL = swap_vfp.S
DSWAPKERNEL = swap_vfp.S
CSWAPKERNEL = swap_vfp.S
ZSWAPKERNEL = swap_vfp.S
SASUMKERNEL = asum_vfp.S
DASUMKERNEL = asum_vfp.S
CASUMKERNEL = asum_vfp.S
ZASUMKERNEL = asum_vfp.S
SAXPYKERNEL = axpy_vfp.S
DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
# COPY and DOT: a dedicated source file per precision.
SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
CDOTKERNEL = cdot_vfp.S
ZDOTKERNEL = zdot_vfp.S
# NRM2 uses the vfpv3 source; ROT and SCAL use the plain vfp sources.
SNRM2KERNEL = nrm2_vfpv3.S
DNRM2KERNEL = nrm2_vfpv3.S
CNRM2KERNEL = nrm2_vfpv3.S
ZNRM2KERNEL = nrm2_vfpv3.S
SROTKERNEL = rot_vfp.S
DROTKERNEL = rot_vfp.S
CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S
# GEMV (N and T): real precisions share a source, complex ones have their own.
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S
SGEMVTKERNEL = gemv_t_vfp.S
DGEMVTKERNEL = gemv_t_vfp.S
CGEMVTKERNEL = cgemv_t_vfp.S
ZGEMVTKERNEL = zgemv_t_vfp.S
# TRMM kernels; file names indicate 4x4 blocking for real and 2x2 for complex.
STRMMKERNEL = strmm_kernel_4x4_vfpv3.S
DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S
CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S
ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S
# SGEMM: 4x4 kernel. The inner copy variables are deliberately left empty and
# only the outer (O*) copy routines are assigned.
# The commented-out line below names the generic C kernel as an alternative.
#SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = sgemm_ncopy_4_vfp.S
SGEMMOTCOPY = sgemm_tcopy_4_vfp.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
# DGEMM: same layout as SGEMM above.
DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = dgemm_ncopy_4_vfp.S
DGEMMOTCOPY = dgemm_tcopy_4_vfp.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
# CGEMM and ZGEMM: 2x2 vfpv3 kernels with vfp outer copy routines.
CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S
CGEMMONCOPY = cgemm_ncopy_2_vfp.S
CGEMMOTCOPY = cgemm_tcopy_2_vfp.S
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S
ZGEMMONCOPY = zgemm_ncopy_2_vfp.S
ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels: every precision and side uses the generic C implementation.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

2
kernel/arm/Makefile Normal file
View File

@ -0,0 +1,2 @@
# Double-colon clean target with no prerequisites and no recipe: it does
# nothing by itself.
# NOTE(review): presumably present so a recursive "make clean" succeeds in this
# directory - confirm against the parent makefiles.
clean ::

73
kernel/arm/amax.c Normal file
View File

@ -0,0 +1,73 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Largest absolute value in a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries (must be >= 1)
 *
 * Returns max(|x[k * inc_x]|) for k = 0 .. n-1, or 0.0 when n <= 0 or
 * inc_x < 1.  In that case x is never dereferenced.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT maxf = 0.0;

	/* An empty vector (n == 0) has no x[0]; reject it together with n < 0. */
	if (n <= 0 || inc_x < 1) return(maxf);

	maxf = ABS(x[0]);

	/* x[0] already seeded the result, so the scan starts at the second entry.
	   maxf is an absolute value already and needs no further ABS(). */
	ix = inc_x;
	for (i = 1; i < n; i++)
	{
		if (ABS(x[ix]) > maxf)
		{
			maxf = ABS(x[ix]);
		}
		ix += inc_x;
	}
	return(maxf);
}

73
kernel/arm/amin.c Normal file
View File

@ -0,0 +1,73 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Smallest absolute value in a strided vector.
 *
 * n      number of elements to examine
 * x      first element of the vector
 * inc_x  distance, in elements, between consecutive entries (must be >= 1)
 *
 * Returns min(|x[k * inc_x]|) for k = 0 .. n-1, or 0.0 when n <= 0 or
 * inc_x < 1.  In that case x is never dereferenced.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	FLOAT minf = 0.0;

	/* An empty vector (n == 0) has no x[0]; reject it together with n < 0. */
	if (n <= 0 || inc_x < 1) return(minf);

	minf = ABS(x[0]);

	/* x[0] already seeded the result, so the scan starts at the second entry.
	   minf is an absolute value already and needs no further ABS(). */
	ix = inc_x;
	for (i = 1; i < n; i++)
	{
		if (ABS(x[ix]) < minf)
		{
			minf = ABS(x[ix]);
		}
		ix += inc_x;
	}
	return(minf);
}

67
kernel/arm/asum.c Normal file
View File

@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Sum of absolute values of a strided vector.
 *
 * Adds |x[0]|, |x[inc_x]|, ... for n entries and returns the total.
 * Returns 0.0 without touching x when n is negative or inc_x is below 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT total = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n < 0 || inc_x < 1) return(total);

	/* One past the last index visited when stepping by inc_x. */
	limit = n * inc_x;
	for (pos = 0; pos < limit; pos += inc_x)
	{
		total += ABS(x[pos]);
	}
	return(total);
}

481
kernel/arm/asum_vfp.S Normal file
View File

@ -0,0 +1,481 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// STACKSIZE is defined but not referenced anywhere in this routine.
#define STACKSIZE 256
// Register aliases: the three integer arguments arrive in r0-r2.
#define N r0
#define X r1
#define INC_X r2
// I is the loop counter (r12).
#define I r12
// Prefetch distance in bytes used by the pld instructions.
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// Naming used below:
//   KERNEL_F4 / KERNEL_F1  unit-stride path, X advanced by load writeback
//   KERNEL_S4 / KERNEL_S1  strided path, X advanced by INC_X after each load
//                          (INC_X is converted to a byte stride before use)
// Partial sums live in d0/d1 (DOUBLE) or s0/s1 (single precision).
// Real (non-COMPLEX) variants: one element is one value.
#if !defined(COMPLEX)
#if defined(DOUBLE)
// Four consecutive doubles; |x| is accumulated alternately into d0 and d1.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
// One double, accumulated into d0.
.macro KERNEL_F1
fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
.endm
// Four strided doubles, all accumulated into d0.
.macro KERNEL_S4
fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X
fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X
fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X
fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
// One strided double, accumulated into d0.
.macro KERNEL_S1
fldmiad X, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
add X, X, INC_X
.endm
#else
// Four consecutive floats; |x| is accumulated alternately into s0 and s1.
// No pld here: the calling loop issues it for this variant.
.macro KERNEL_F4
fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
// One float, accumulated into s0.
.macro KERNEL_F1
fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
.endm
// Four strided floats, all accumulated into s0.
.macro KERNEL_S4
fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X
fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X
fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X
fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
// One strided float, accumulated into s0.
.macro KERNEL_S1
fldmias X, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
add X, X, INC_X
.endm
#endif
#else
// COMPLEX variants: one element is two values and both absolute values are
// added to the sum.
#if defined(DOUBLE)
// Four complex doubles (eight values), accumulated alternately into d0 and d1.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
fldmiad X!, { d6 - d7 }
vabs.f64 d6, d6
vadd.f64 d1 , d1, d5
vabs.f64 d7, d7
vadd.f64 d0 , d0, d6
vadd.f64 d1 , d1, d7
.endm
// One complex double (two values), accumulated into d0.
.macro KERNEL_F1
fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
fldmiad X!, { d4 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
.endm
// Four strided complex doubles; each load fetches the pair d4-d5.
.macro KERNEL_S4
fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X
fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X
fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X
fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
// One strided complex double.
.macro KERNEL_S1
fldmiad X, { d4 -d5 }
vabs.f64 d4, d4
vadd.f64 d0 , d0, d4
vabs.f64 d5, d5
vadd.f64 d0 , d0, d5
add X, X, INC_X
.endm
#else
// Four complex floats (eight values), accumulated alternately into s0 and s1.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
fldmias X!, { s4 - s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
fldmias X!, { s6 - s7 }
vabs.f32 s6, s6
vadd.f32 s1 , s1, s5
vabs.f32 s7, s7
vadd.f32 s0 , s0, s6
vadd.f32 s1 , s1, s7
.endm
// One complex float (two values), accumulated into s0.
.macro KERNEL_F1
fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
fldmias X!, { s4 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
.endm
// Four strided complex floats; each load fetches the pair s4-s5.
.macro KERNEL_S4
fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X
fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X
fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X
fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
// One strided complex float.
.macro KERNEL_S1
fldmias X, { s4 -s5 }
vabs.f32 s4, s4
vadd.f32 s0 , s0, s4
vabs.f32 s5, s5
vadd.f32 s0 , s0, s5
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Sum of absolute values of a vector.
//   N (r0)      element count
//   X (r1)      pointer to the first element
//   INC_X (r2)  stride in elements; rescaled to bytes on the strided path
// The result is returned in d0 (DOUBLE) or s0 (single precision).
// N <= 0 or INC_X == 0 returns 0.
	PROLOGUE
	.align 5

	// Clear both partial-sum accumulators from an integer zero.
	// The accumulator registers are not arguments here and hold whatever the
	// caller left in them. Clearing with "x - x" yields NaN when that stale
	// value is NaN or infinity, which would poison the returned sum.
	mov	r12, #0				// r12 (I) is not live yet
	vmov	s0, r12
	vmov	s1, r12
#if defined(DOUBLE)
	vcvt.f64.f32	d0, s0			// d0 = 0.0 (s0 and s1 stay all-zero bits)
	vcvt.f64.f32	d1, s1			// d1 = 0.0
#endif

	cmp	N, #0
	ble	asum_kernel_L999
	cmp	INC_X, #0
	beq	asum_kernel_L999
	cmp	INC_X, #1
	bne	asum_kernel_S_BEGIN

// Unit-stride path: blocks of four elements, then the remainder.
asum_kernel_F_BEGIN:
	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_F1
	.align 5

// Loop body is unrolled twice; the middle test leaves when the block count is odd.
asum_kernel_F4:
#if !defined(DOUBLE) && !defined(COMPLEX)
	pld	[ X, #X_PRE ]
#endif
	KERNEL_F4
	subs	I, I, #1
	ble	asum_kernel_F1
	KERNEL_F4
	subs	I, I, #1
	bne	asum_kernel_F4

asum_kernel_F1:
	ands	I, N, #3			// I = N % 4
	ble	asum_kernel_L999

asum_kernel_F10:
	KERNEL_F1
	subs	I, I, #1
	bne	asum_kernel_F10
	b	asum_kernel_L999

// Strided path: convert INC_X from elements to bytes first.
asum_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4		// INC_X * SIZE * 2
#else
	lsl	INC_X, INC_X, #3		// INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		// INC_X * SIZE
#else
	lsl	INC_X, INC_X, #2		// INC_X * SIZE
#endif
#endif
	asrs	I, N, #2			// I = N / 4
	ble	asum_kernel_S1
	.align 5

asum_kernel_S4:
	KERNEL_S4
	subs	I, I, #1
	bne	asum_kernel_S4

asum_kernel_S1:
	ands	I, N, #3			// I = N % 4
	ble	asum_kernel_L999

asum_kernel_S10:
	KERNEL_S1
	subs	I, I, #1
	bne	asum_kernel_S10

// Combine the two partial sums into the return register.
asum_kernel_L999:
#if defined(DOUBLE)
	vadd.f64	d0 , d0, d1		// set return value
#else
	vadd.f32	s0 , s0, s1		// set return value
#endif
	bx	lr
	EPILOGUE

64
kernel/arm/axpy.c Normal file
View File

@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * y := y + da * x over n strided elements.
 *
 * x is read with stride inc_x and y is updated in place with stride inc_y,
 * both starting at index 0.  The dummy arguments are not used.
 * Nothing is written when n is negative or da equals zero.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG src = 0;
	BLASLONG dst = 0;

	/* Nothing to do for a negative length or a zero scale factor. */
	if (n < 0 || da == 0.0) return(0);

	for (k = 0; k < n; k++)
	{
		y[dst] += da * x[src];
		src += inc_x;
		dst += inc_y;
	}
	return(0);
}

503
kernel/arm/axpy_vfp.S Normal file
View File

@ -0,0 +1,503 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : xOK
* CTEST : xOK
* TEST : xOK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes reserved below the frame pointer by the routine prologue.
#define STACKSIZE 256
// Stack-passed arguments. fp is set to the stack pointer value at entry, so
// offset 0 is the first word the caller placed on the stack.
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
// Register aliases. r1 and r2 are reused for Y and INC_X, which are loaded
// from the stack; X is taken directly from r3.
// NOTE(review): presumably r1/r2 carry the unused dummy0/dummy1 arguments on
// entry, as in the C axpy kernel signature - confirm against the caller.
#define N r0
#define Y r1
#define INC_X r2
#define X r3
#define INC_Y r4
// I is the loop counter (r12).
#define I r12
// Prefetch distance in bytes used by the pld instructions (for X and Y).
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*****************************************************************************************/
// Multiply-accumulate selectors for the complex variants. With alpha in
// (d0,d1) or (s0,s1) and x = (xr,xi) the macros below compute
//   yr = yr FMAC_R1 alpha_r*xr FMAC_R2 alpha_i*xi
//   yi = yi FMAC_I1 alpha_r*xi FMAC_I2 alpha_i*xr
// fmac adds the product, fnmac subtracts it. Without CONJ the alpha_i*xi term
// is subtracted; with CONJ the alpha_r*xi term is subtracted instead.
#if !defined(CONJ)
#if defined(DOUBLE)
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#endif
#else // CONJ
#if defined(DOUBLE)
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#else
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#endif
#endif
// Naming used below:
//   KERNEL_F4 / KERNEL_F1  unit-stride path; X advances on load, Y on store
//   KERNEL_S1              strided path; X and Y advance by INC_X and INC_Y
// Real (non-COMPLEX) variants: alpha is in d0 (DOUBLE) or s0.
#if !defined(COMPLEX)
#if defined(DOUBLE)
// Four doubles: load x and y, then y += alpha * x, storing each result as it
// becomes ready.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }
fmacd d8 , d0, d4
fstmiad Y!, { d8 }
fmacd d9 , d0, d5
fstmiad Y!, { d9 }
fmacd d10, d0, d6
fstmiad Y!, { d10 }
fmacd d11, d0, d7
fstmiad Y!, { d11 }
.endm
// One double, unit stride.
.macro KERNEL_F1
fldmiad X!, { d4 }
fldmiad Y , { d8 }
fmacd d8 , d0, d4
fstmiad Y!, { d8 }
.endm
// One double, strided.
.macro KERNEL_S1
fldmiad X , { d4 }
fldmiad Y , { d8 }
fmacd d8 , d0, d4
fstmiad Y , { d8 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#else
// Four floats, unit stride. No pld here: the calling loop issues it for this
// variant.
.macro KERNEL_F4
fldmias X!, { s4 - s7 }
fldmias Y , { s8 - s11 }
fmacs s8 , s0, s4
fstmias Y!, { s8 }
fmacs s9 , s0, s5
fstmias Y!, { s9 }
fmacs s10, s0, s6
fstmias Y!, { s10 }
fmacs s11, s0, s7
fstmias Y!, { s11 }
.endm
// One float, unit stride.
.macro KERNEL_F1
fldmias X!, { s4 }
fldmias Y , { s8 }
fmacs s8 , s0, s4
fstmias Y!, { s8 }
.endm
// One float, strided.
.macro KERNEL_S1
fldmias X , { s4 }
fldmias Y , { s8 }
fmacs s8 , s0, s4
fstmias Y , { s8 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#endif
#else
// COMPLEX variants: alpha is in (d0,d1) or (s0,s1); each element is a
// (real, imaginary) pair.
#if defined(DOUBLE)
// Four complex doubles, handled as two groups of two elements.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }
FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }
FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 }
fstmiad Y!, { d11 }
pld [ X, #X_PRE ]
fldmiad X!, { d4 - d7 }
pld [ Y, #X_PRE ]
fldmiad Y , { d8 - d11 }
FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }
FMAC_R1 d10, d0, d6
FMAC_R2 d10, d1, d7
FMAC_I1 d11, d0, d7
FMAC_I2 d11, d1, d6
fstmiad Y!, { d10 }
fstmiad Y!, { d11 }
.endm
// One complex double, unit stride.
.macro KERNEL_F1
fldmiad X!, { d4 - d5 }
fldmiad Y , { d8 - d9 }
FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y!, { d8 }
fstmiad Y!, { d9 }
.endm
// One complex double, strided.
.macro KERNEL_S1
fldmiad X , { d4 - d5 }
fldmiad Y , { d8 - d9 }
FMAC_R1 d8 , d0, d4
FMAC_R2 d8 , d1, d5
FMAC_I1 d9 , d0, d5
FMAC_I2 d9 , d1, d4
fstmiad Y , { d8 - d9 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#else
// Four complex floats, handled as two groups of two elements.
.macro KERNEL_F4
pld [ X, #X_PRE ]
fldmias X!, { s4 - s7 }
pld [ Y, #X_PRE ]
fldmias Y , { s8 - s11 }
FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }
FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6
fstmias Y!, { s10 }
fstmias Y!, { s11 }
fldmias X!, { s4 - s7 }
fldmias Y , { s8 - s11 }
FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }
FMAC_R1 s10, s0, s6
FMAC_R2 s10, s1, s7
FMAC_I1 s11, s0, s7
FMAC_I2 s11, s1, s6
fstmias Y!, { s10 }
fstmias Y!, { s11 }
.endm
// One complex float, unit stride.
.macro KERNEL_F1
fldmias X!, { s4 - s5 }
fldmias Y , { s8 - s9 }
FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y!, { s8 }
fstmias Y!, { s9 }
.endm
// One complex float, strided.
.macro KERNEL_S1
fldmias X , { s4 - s5 }
fldmias Y , { s8 - s9 }
FMAC_R1 s8 , s0, s4
FMAC_R2 s8 , s1, s5
FMAC_I1 s9 , s0, s5
FMAC_I2 s9 , s1, s4
fstmias Y , { s8 - s9 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// y := y + alpha * x.
// N is in r0 and X in r3; INC_X, Y and INC_Y are loaded from the stack.
// alpha is read from d0/s0 (plus d1/s1 for COMPLEX) by the KERNEL_* macros.
// Always returns 0 in r0.
PROLOGUE
.align 5
// Frame setup: save r4 and fp, point fp at the stack pointer value on entry
// (where the stack arguments start), then reserve STACKSIZE bytes.
push {r4 , fp}
add fp, sp, #8
sub sp, sp, #STACKSIZE // reserve stack
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
// Save area for floating point registers starts 128 bytes below fp.
// r12 is only a scratch pointer here; it becomes the loop counter I later.
// NOTE(review): the single-precision build stores s8 - s15, which alias
// d4 - d7 rather than d8 - d15 - confirm this save list is what is intended.
sub r12, fp, #128
#if defined(DOUBLE)
vstm r12, { d8 - d15} // store floating point registers
#else
vstm r12, { s8 - s15} // store floating point registers
#endif
// Nothing to do for N <= 0 or a zero stride on either vector.
cmp N, #0
ble axpy_kernel_L999
cmp INC_X, #0
beq axpy_kernel_L999
cmp INC_Y, #0
beq axpy_kernel_L999
// The unit-stride path needs both strides equal to 1.
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble axpy_kernel_F1
.align 5
// Loop body is unrolled twice; the middle test leaves when the block count is odd.
axpy_kernel_F4:
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble axpy_kernel_F1
KERNEL_F4
subs I, I, #1
bne axpy_kernel_F4
// Remaining N % 4 elements, one at a time.
axpy_kernel_F1:
ands I, N, #3
ble axpy_kernel_L999
axpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
b axpy_kernel_L999
// Strided path: convert both strides from elements to bytes first.
axpy_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble axpy_kernel_S1
.align 5
// Four single-element steps per iteration.
axpy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
// Remaining N % 4 elements.
axpy_kernel_S1:
ands I, N, #3
ble axpy_kernel_L999
axpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
// Common exit: restore the saved floating point registers, unwind the frame.
// r3 (X) is no longer needed and is reused as the restore pointer.
axpy_kernel_L999:
sub r3, fp, #128
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #8
pop {r4,fp}
bx lr
EPILOGUE

222
kernel/arm/ccopy_vfp.S Normal file
View File

@ -0,0 +1,222 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers. */
#define STACKSIZE 256
/* Values that arrive in core registers. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4]
 * is the first word above the pushed registers. */
#define OLD_INC_Y [fp, #4 ]
/* Working registers. */
#define I r5
#define Y r6
#define INC_Y r7
/* Prefetch distance in bytes used by COPY_F4. */
#define X_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * COPY_F4: copy eight floats (four complex single-precision elements) from
 * X to Y on the contiguous path. Both pointers are post-incremented by
 * 32 bytes; a prefetch is issued X_PRE bytes ahead of X.
 */
.macro COPY_F4
pld [ X, #X_PRE ]
fldmias X!, { s0 - s7 }
fstmias Y!, { s0 - s7 }
.endm
/*
 * COPY_F1: copy two floats (one complex element) from X to Y on the
 * contiguous path; both pointers are post-incremented by 8 bytes.
 */
.macro COPY_F1
fldmias X!, { s0 - s1 }
fstmias Y!, { s0 - s1 }
.endm
/*************************************************************************************************************************/
/*
 * COPY_S4: copy four complex elements on the strided path. Each element is
 * two floats, moved through s0-s1 or s2-s3 alternately; after every element
 * X and Y advance by the byte strides INC_X and INC_Y.
 * NOTE(review): the leading nop has no visible purpose here - possibly an
 * alignment or scheduling aid; confirm before removing.
 */
.macro COPY_S4
nop
fldmias X, { s0 - s1 } // element 0
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s2 - s3 } // element 1
fstmias Y, { s2 - s3 }
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s0 - s1 } // element 2
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s2 - s3 } // element 3
fstmias Y, { s2 - s3 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
/*
 * COPY_S1: copy one complex element (two floats) on the strided path, then
 * advance X and Y by the byte strides INC_X and INC_Y.
 */
.macro COPY_S1
fldmias X, { s0 - s1 }
fstmias Y, { s0 - s1 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * CCOPY entry point: copy N complex single-precision elements from X to Y.
 * In: N = r0, X = r1, INC_X = r2, Y pointer in r3 (OLD_Y), INC_Y on the
 * stack (OLD_INC_Y). Returns 0 in r0.
 *
 * Nothing is copied when N <= 0 or either increment is zero. When both
 * increments are 1 the contiguous macros COPY_F4 / COPY_F1 are used,
 * otherwise the strided macros COPY_S4 / COPY_S1 with byte strides.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp points at the saved fp slot (7 registers pushed)
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble ccopy_kernel_L999
cmp INC_X, #0
beq ccopy_kernel_L999
cmp INC_Y, #0
beq ccopy_kernel_L999
cmp INC_X, #1
bne ccopy_kernel_S_BEGIN
cmp INC_Y, #1
bne ccopy_kernel_S_BEGIN
/* Contiguous path. */
ccopy_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble ccopy_kernel_F1
ccopy_kernel_F4:
COPY_F4
subs I, I, #1
bne ccopy_kernel_F4
ccopy_kernel_F1:
ands I, N, #3 // remaining 0..3 elements
ble ccopy_kernel_L999
ccopy_kernel_F10:
COPY_F1
subs I, I, #1
bne ccopy_kernel_F10
b ccopy_kernel_L999
/* Strided path: increments become byte strides (8 bytes per element). */
ccopy_kernel_S_BEGIN:
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
asrs I, N, #2 // I = N / 4
ble ccopy_kernel_S1
ccopy_kernel_S4:
COPY_S4
subs I, I, #1
bne ccopy_kernel_S4
ccopy_kernel_S1:
ands I, N, #3 // remaining 0..3 elements
ble ccopy_kernel_L999
ccopy_kernel_S10:
COPY_S1
subs I, I, #1
bne ccopy_kernel_S10
/* Common exit: restore VFP registers, unwind the frame, return 0. */
ccopy_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

284
kernel/arm/cdot_vfp.S Normal file
View File

@ -0,0 +1,284 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers. */
#define STACKSIZE 256
/* Values that arrive in core registers. */
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4]
 * is the first word above the pushed registers. */
#define OLD_INC_Y [fp, #4 ]
/* Working registers. */
#define I r5
#define Y r6
#define INC_Y r7
/* Prefetch distance in bytes, applied to both X and Y in KERNEL_F4. */
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * KERNEL_F4: accumulate four complex products on the contiguous path.
 * For each element, with x = (xa, xb) and y = (ya, yb) as stored in memory:
 *   s0 += xa * ya     s1 += xa * yb
 *   s2 += xb * yb     s3 += xb * ya
 * X and Y are post-incremented by 8 bytes per element. Loads are interleaved
 * with the multiply-accumulates; keep the instruction order.
 */
.macro KERNEL_F4
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmias X!, { s4 - s5 } // element 0
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fldmias X!, { s6 - s7 } // element 1 (x part loaded early)
fmacs s2 , s5, s9
fmacs s3 , s5, s8
fldmias Y!, { s10 - s11 }
fmacs s0 , s6, s10
fmacs s1 , s6, s11
fmacs s2 , s7, s11
fmacs s3 , s7, s10
fldmias X!, { s4 - s5 } // element 2
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fldmias X!, { s6 - s7 } // element 3 (x part loaded early)
fmacs s2 , s5, s9
fmacs s3 , s5, s8
fldmias Y!, { s10 - s11 }
fmacs s0 , s6, s10
fmacs s1 , s6, s11
fmacs s2 , s7, s11
fmacs s3 , s7, s10
.endm
/*
 * KERNEL_F1: accumulate one complex product on the contiguous path into
 * s0-s3 (same layout as KERNEL_F4); X and Y are post-incremented by 8 bytes.
 */
.macro KERNEL_F1
fldmias X!, { s4 - s5 }
fldmias Y!, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
.endm
/*************************************************************************************************************************/
/*
 * KERNEL_S4: accumulate four complex products on the strided path. Each of
 * the four identical groups loads one element of x and y without writeback,
 * updates the accumulators s0-s3, then advances X and Y by the byte strides
 * INC_X and INC_Y.
 * NOTE(review): the leading nop has no visible purpose here - possibly an
 * alignment or scheduling aid; confirm before removing.
 */
.macro KERNEL_S4
nop
fldmias X, { s4 - s5 } // element 0
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s4 - s5 } // element 1
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s4 - s5 } // element 2
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y
fldmias X, { s4 - s5 } // element 3
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y
.endm
/*
 * KERNEL_S1: accumulate one complex product on the strided path into s0-s3,
 * then advance X and Y by the byte strides INC_X and INC_Y.
 */
.macro KERNEL_S1
fldmias X, { s4 - s5 }
fldmias Y, { s8 - s9 }
fmacs s0 , s4, s8
fmacs s1 , s4, s9
fmacs s2 , s5, s9
fmacs s3 , s5, s8
add X, X, INC_X
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * CDOT entry point: complex single-precision dot product of X and Y.
 * In: N = r0, X = r1, INC_X = r2, Y pointer in r3 (OLD_Y), INC_Y on the
 * stack (OLD_INC_Y). The result is left in s0 and s1.
 *
 * The kernels keep four partial sums (x = (xa, xb), y = (ya, yb)):
 *   s0 = sum xa*ya, s1 = sum xa*yb, s2 = sum xb*yb, s3 = sum xb*ya
 * which are combined at the end as
 *   !CONJ: s0 = s0 - s2, s1 = s1 + s3
 *    CONJ: s0 = s0 + s2, s1 = s1 - s3
 * When N <= 0 or either increment is zero the sums stay at zero.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp points at the saved fp slot (7 registers pushed)
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
/*
 * Clear the accumulators with an exact +0.0 bit pattern. Subtracting a
 * register from itself is not a safe way to do this: s0-s3 hold arbitrary
 * values on entry and Inf - Inf or NaN - NaN yields NaN. r4 is free again
 * once the vstm above has used it as a base address.
 */
mov r4, #0
vmov s0 , r4
vmov s1 , r4
vmov s2 , r4
vmov s3 , r4
cmp N, #0
ble cdot_kernel_L999
cmp INC_X, #0
beq cdot_kernel_L999
cmp INC_Y, #0
beq cdot_kernel_L999
cmp INC_X, #1
bne cdot_kernel_S_BEGIN
cmp INC_Y, #1
bne cdot_kernel_S_BEGIN
/* Contiguous path. */
cdot_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble cdot_kernel_F1
cdot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne cdot_kernel_F4
cdot_kernel_F1:
ands I, N, #3 // remaining 0..3 elements; ands leaves V untouched
beq cdot_kernel_L999 // so test Z only
cdot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne cdot_kernel_F10
b cdot_kernel_L999
/* Strided path: increments become byte strides (8 bytes per element). */
cdot_kernel_S_BEGIN:
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
asrs I, N, #2 // I = N / 4
ble cdot_kernel_S1
cdot_kernel_S4:
KERNEL_S4
subs I, I, #1
bne cdot_kernel_S4
cdot_kernel_S1:
ands I, N, #3 // remaining 0..3 elements; ands leaves V untouched
beq cdot_kernel_L999 // so test Z only
cdot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne cdot_kernel_S10
/* Common exit: restore VFP registers and combine the partial sums. */
cdot_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
#if !defined(CONJ)
vsub.f32 s0 , s0, s2
vadd.f32 s1 , s1, s3
#else
vadd.f32 s0 , s0, s2
vsub.f32 s1 , s1, s3
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,258 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers. */
#define STACKSIZE 256
/* Values that arrive in core registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* LDA is a local stack slot written by the prologue (byte stride). */
#define LDA [fp, #-260 ]
/* The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4]
 * is the first word above the pushed registers. */
#define B [fp, #4 ]
/* Working registers; M, N and A stay in their incoming registers. */
#define M r0
#define N r1
#define A r2
#define BO r5
#define AO1 r6
#define AO2 r7
/* I reuses r3 once the incoming lda has been scaled and stored to LDA. */
#define I r3
#define J r12
/* Prefetch distance in bytes. */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * COPY2x2: read two complex elements (four floats) from each of AO1 and AO2
 * and write them to BO interleaved per element:
 *   AO1[0], AO2[0], AO1[1], AO2[1]
 * AO1 and AO2 advance by 16 bytes, BO by 32 bytes.
 */
.macro COPY2x2
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s4 , [ AO1, #8 ]
flds s5 , [ AO1, #12 ]
flds s2 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #16
flds s6 , [ AO2, #8 ]
flds s7 , [ AO2, #12 ]
fstmias BO!, { s0 - s7 }
add AO2, AO2, #16
.endm
/*
 * COPY1x2: read one complex element from each of AO1 and AO2 and write them
 * to BO as AO1[0], AO2[0]. AO1 and AO2 advance by 8 bytes, BO by 16 bytes.
 */
.macro COPY1x2
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #8
fstmias BO!, { s0 - s3 }
add AO2, AO2, #8
.endm
/*
 * COPY2x1: copy two consecutive complex elements (four floats) from AO1 to
 * BO. AO1 and BO both advance by 16 bytes.
 */
.macro COPY2x1
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]
fstmias BO!, { s0 - s3 }
add AO1, AO1, #16
.endm
/*
 * COPY1x1: copy one complex element (two floats) from AO1 to BO. AO1 and BO
 * both advance by 8 bytes.
 */
.macro COPY1x1
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
fstmias BO!, { s0 - s1 }
add AO1, AO1, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * cgemm_ncopy entry point: pack a complex single-precision matrix into the
 * buffer B, two columns at a time.
 * In: M = r0, N = r1, A = r2, lda = r3 (in elements), B on the stack.
 * Returns 0 in r0.
 *
 * Outer loop: N / 2 column pairs (AO1 = A, AO2 = A + lda), then one trailing
 * column when N is odd. Inner loop: M / 2 row pairs, then one trailing row
 * when M is odd. BO walks linearly through the output buffer.
 *
 * Flag handling: asr/ands/tst do not write the V flag, and the condition
 * flags are undefined on function entry, so "ble" straight after one of
 * them would depend on a stale V. Shift results are therefore compared
 * explicitly and the single-bit tests use "beq".
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp points at the saved fp slot (7 registers pushed)
sub sp, sp, #STACKSIZE // reserve stack
lsl r3, r3, #3 // lda = lda * 4 * 2
str r3, LDA
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
ldr BO, B
/*********************************************************************************************/
cgemm_ncopy_L2_BEGIN:
asr J, N, #1 // J = N / 2
cmp J, #0 // cmp defines N, Z and V for the signed test
ble cgemm_ncopy_L1_BEGIN
cgemm_ncopy_L2_M2_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA
asr I, M, #1 // I = M / 2
cmp I, #0
ble cgemm_ncopy_L2_M2_40
/* Row-pair loop, unrolled twice with a middle exit for odd counts. */
cgemm_ncopy_L2_M2_20:
pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
COPY2x2
subs I , I , #1
ble cgemm_ncopy_L2_M2_40
COPY2x2
subs I , I , #1
bne cgemm_ncopy_L2_M2_20
cgemm_ncopy_L2_M2_40:
ands I, M , #1 // trailing row when M is odd
beq cgemm_ncopy_L2_M2_END
cgemm_ncopy_L2_M2_60:
COPY1x2
subs I , I , #1
bne cgemm_ncopy_L2_M2_60
cgemm_ncopy_L2_M2_END:
subs J , J, #1 // j--
bne cgemm_ncopy_L2_M2_BEGIN
/*********************************************************************************************/
cgemm_ncopy_L1_BEGIN:
tst N, #1 // trailing column when N is odd
beq cgemm_ncopy_L999
cgemm_ncopy_L1_M2_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA
asr I, M, #1 // I = M / 2
cmp I, #0
ble cgemm_ncopy_L1_M2_40
cgemm_ncopy_L1_M2_20:
COPY2x1
subs I , I , #1
bne cgemm_ncopy_L1_M2_20
cgemm_ncopy_L1_M2_40:
ands I, M , #1 // trailing row when M is odd
beq cgemm_ncopy_L1_M2_END
cgemm_ncopy_L1_M2_60:
COPY1x1
subs I , I , #1
bne cgemm_ncopy_L1_M2_60
cgemm_ncopy_L1_M2_END:
/* Common exit: restore VFP registers, unwind the frame, return 0. */
cgemm_ncopy_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
movs r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,243 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers. */
#define STACKSIZE 256
/* Values that arrive in core registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* The prologue pushes seven registers and sets fp = sp + 24, so [fp, #4]
 * is the first word above the pushed registers. The entry code also uses
 * this slot as the running output pointer. */
#define B [fp, #4 ]
/* A is a local stack slot holding the running source pointer. */
#define A [fp, #-248 ]
#define M r0
#define N r1
/* M4 reuses r2 after the incoming A pointer has been stored to the A slot. */
#define M4 r2
#define LDA r5
#define AO1 r6
#define BO1 r7
#define BO2 r8
#define I r4
#define J r12
/* Prefetch distance in bytes (not referenced by the macros in this file
 * section). */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * COPY2x2: read two complex elements (four floats) at AO1 and two more at
 * AO1 + LDA, and store all eight floats at BO1. AO1 advances by 16 bytes,
 * BO1 by M4 bytes. Clobbers r3.
 */
.macro COPY2x2
fldmias AO1, { s0 - s3 }
add r3, AO1, LDA // r3 = same position, next LDA stride
fldmias r3, { s4 - s7 }
fstmias BO1, { s0 - s7 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
/*
 * COPY1x2: read one complex element at AO1 and one at AO1 + LDA, and store
 * the four floats at BO2. AO1 advances by 8 bytes, BO2 by 16 bytes.
 * Clobbers r3.
 */
.macro COPY1x2
fldmias AO1, { s0 -s1 }
add r3, AO1, LDA // r3 = same position, next LDA stride
fldmias r3, { s2 - s3 }
fstmias BO2, { s0 - s3 }
add AO1, AO1, #8
add BO2, BO2, #16
.endm
/*************************************************************************************************************************/
/*
 * COPY2x1: copy two consecutive complex elements (four floats) from AO1 to
 * BO1. AO1 advances by 16 bytes, BO1 by M4 bytes.
 */
.macro COPY2x1
fldmias AO1, { s0 - s3 }
fstmias BO1, { s0 - s3 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
/*
 * COPY1x1: copy one complex element (two floats) from AO1 to BO2. Both
 * pointers advance by 8 bytes.
 */
.macro COPY1x1
fldmias AO1, { s0 - s1 }
fstmias BO2, { s0 - s1 }
add AO1, AO1, #8
add BO2, BO2, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * cgemm_tcopy entry point: pack a complex single-precision matrix into the
 * buffer B.
 * In: M = r0, N = r1, A = r2, lda = r3 (in elements), B on the stack.
 * Returns 0 in r0.
 *
 * Setup: LDA becomes a byte stride, M4 = M * 16 bytes, and
 * BO2 = B + (N & ~1) * M * 8 bytes is where the odd-N tail is written.
 * Outer loop: M / 2 passes, each advancing the source by 2 * LDA and the
 * output start by 32 bytes, then one more pass when M is odd (advancing by
 * LDA and 16 bytes). Inner loop: N / 2 COPY2x2 / COPY2x1 steps with BO1
 * striding by M4, then a single COPY1x2 / COPY1x1 into BO2 when N is odd.
 *
 * Flag handling: asr/tst do not write the V flag, and the condition flags
 * are undefined on function entry, so "ble" straight after one of them
 * would depend on a stale V. Shift results are therefore compared
 * explicitly and the single-bit tests use "beq".
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp points at the saved fp slot (7 registers pushed)
sub sp, sp, #STACKSIZE // reserve stack
str OLD_A, A // store A
lsl LDA, OLD_LDA, #3 // lda = lda * SIZE * 2
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
lsl r4 , M, #3 // M * SIZE * 2
ldr r3, B
and BO2 , N , #-2 // N rounded down to an even count
mul BO2, BO2, r4
add BO2 , BO2, r3 // BO2 = B + (N & ~1) * M * SIZE * 2
lsl M4, M, #4 // M4 = M * 2 * SIZE * 2
cgemm_tcopy_L2_BEGIN:
asr J, M, #1 // J = M / 2
cmp J, #0 // cmp defines N, Z and V for the signed test
ble cgemm_tcopy_L1_BEGIN
cgemm_tcopy_L2_M2_BEGIN:
ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #32 // B = B + 4 * SIZE *2
str r3, B
asr I, N, #1 // I = N / 2
cmp I, #0
ble cgemm_tcopy_L2_M2_60
cgemm_tcopy_L2_M2_40:
COPY2x2
subs I, I, #1
bne cgemm_tcopy_L2_M2_40
cgemm_tcopy_L2_M2_60:
tst N , #1 // tail when N is odd
beq cgemm_tcopy_L2_M2_END
COPY1x2
cgemm_tcopy_L2_M2_END:
subs J , J, #1 // j--
bne cgemm_tcopy_L2_M2_BEGIN
/*********************************************************************************************/
cgemm_tcopy_L1_BEGIN:
tst M, #1 // final pass when M is odd
beq cgemm_tcopy_L999
cgemm_tcopy_L1_M2_BEGIN:
ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #16 // B = B + 2 * SIZE *2
str r3, B
asr I, N, #1 // I = N / 2
cmp I, #0
ble cgemm_tcopy_L1_M2_60
cgemm_tcopy_L1_M2_40:
COPY2x1
subs I, I, #1
bne cgemm_tcopy_L1_M2_40
cgemm_tcopy_L1_M2_60:
tst N , #1 // tail when N is odd
beq cgemm_tcopy_L1_M2_END
COPY1x1
cgemm_tcopy_L1_M2_END:
/* Common exit: restore VFP registers, unwind the frame, return 0. */
cgemm_tcopy_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

697
kernel/arm/cgemv_n_vfp.S Normal file
View File

@ -0,0 +1,697 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers. */
#define STACKSIZE 256
/* Stack-passed values. The prologue pushes seven registers and sets
 * fp = sp + 28, so fp equals the value of sp on entry. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Values that arrive in core registers. */
#define OLD_A r3
#define OLD_M r0
/* Working registers. AO1 reuses r0 after M has been stored to its slot. */
#define AO1 r0
#define N r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
/* Local stack slots written by the prologue. */
#define ALPHA_I [fp, #-236]
#define ALPHA_R [fp, #-244]
#define M [fp, #-252 ]
#define A [fp, #-256 ]
/* Prefetch distances in bytes. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0
/**************************************************************************************/
/*
 * Opcode selection per conjugation mode. fmacs computes Sd = Sd + Sn * Sm
 * and fnmacs computes Sd = Sd - Sn * Sm. KMAC_R / KMAC_I are used by the
 * KERNEL_* macros for the second product of the real / imaginary
 * accumulator; FMAC_R1, FMAC_R2, FMAC_I1, FMAC_I2 are used by the SAVE_*
 * macros when alpha times the accumulator is added to y.
 */
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#endif
/*
 * INIT_F4: prefetch the output vector and clear the eight accumulators
 * s8-s15 (four complex sums) for the contiguous 4-row kernel.
 *
 * The zero is written from a core register. Subtracting s8 from itself is
 * not safe: s8 still holds the previous row block result (or the caller
 * value on the first pass) and Inf - Inf or NaN - NaN yields NaN, which
 * would poison every following row. Clobbers r3, which the entry code only
 * uses as a scratch register around each INIT_* call.
 */
.macro INIT_F4
pld [ YO, #Y_PRE ]
mov r3 , #0 // integer zero has the same bit pattern as +0.0f
vmov s8 , r3
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8
.endm
/*
 * KERNEL_F4X4: prefetch x X_PRE bytes ahead, then run KERNEL_F4X1 four
 * times (four consecutive columns for the current four rows).
 */
.macro KERNEL_F4X4
pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1
KERNEL_F4X1
KERNEL_F4X1
.endm
/*
 * KERNEL_F4X1: contiguous path, one column step for four rows. Multiplies
 * the four complex entries at AO1 by the complex x value at XO and
 * accumulates into s8-s15 (pairs s8/s9, s10/s11, s12/s13, s14/s15). With
 * a = (s0, s1) or (s2, s3) and x = (s4, s5), each pair gets
 *   first  += a0 * x0, then KMAC_R with a1 * x1
 *   second += a0 * x1, then KMAC_I with a1 * x0
 * Afterwards XO advances by 8 bytes and AO1 and AO2 by LDA bytes. AO2 is
 * only used as a prefetch address here.
 */
.macro KERNEL_F4X1
pld [ AO2, #A_PRE ]
flds s0 , [ AO1 ] // rows 0 and 1
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]
flds s4 , [ XO ] // x value
flds s5 , [ XO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s0, s5
fmacs s10 , s2, s4
fmacs s11 , s2, s5
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
KMAC_R s10 , s3, s5
KMAC_I s11 , s3, s4
flds s0 , [ AO1, #16 ] // rows 2 and 3
flds s1 , [ AO1, #20 ]
flds s2 , [ AO1, #24 ]
flds s3 , [ AO1, #28 ]
fmacs s12 , s0, s4
fmacs s13 , s0, s5
fmacs s14 , s2, s4
fmacs s15 , s2, s5
KMAC_R s12 , s1, s5
KMAC_I s13 , s1, s4
KMAC_R s14 , s3, s5
KMAC_I s15 , s3, s4
add XO , XO, #8
add AO1 , AO1, LDA
add AO2 , AO2, LDA
.endm
/*
 * SAVE_F4: contiguous path, fold the four accumulated complex sums
 * (s8-s15) into y. alpha is reloaded from its stack slots into s0 (real)
 * and s1 (imaginary); y is processed two complex elements at a time through
 * s4-s7 and YO is post-incremented by 16 bytes per store.
 */
.macro SAVE_F4
flds s0, ALPHA_R
flds s1, ALPHA_I
fldmias YO, { s4 - s7 } // y elements 0 and 1
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8
FMAC_R1 s6 , s0 , s10
FMAC_I1 s7 , s0 , s11
FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10
fstmias YO!, { s4 - s7 }
fldmias YO, { s4 - s7 } // y elements 2 and 3
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14
fstmias YO!, { s4 - s7 }
.endm
/*
 * INIT_F1: clear the accumulator pair s8/s9 for the contiguous 1-row
 * kernel.
 *
 * The zero is written from a core register. Subtracting s8 from itself is
 * not safe: s8 still holds the previous row result and Inf - Inf or
 * NaN - NaN yields NaN, which would poison every following row. Clobbers
 * r3, which the entry code only uses as a scratch register around each
 * INIT_* call.
 */
.macro INIT_F1
mov r3 , #0 // integer zero has the same bit pattern as +0.0f
vmov s8 , r3
vmov.f32 s9 , s8
.endm
/*
 * KERNEL_F1X1: contiguous path, one column step for a single row.
 * Multiplies the complex entry at AO1 by the complex x value at XO and
 * accumulates into s8/s9. XO advances by 8 bytes and AO1 by LDA bytes.
 */
.macro KERNEL_F1X1
flds s0 , [ AO1 ]
flds s1 , [ AO1, #4 ]
flds s4 , [ XO ]
flds s5 , [ XO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s0, s5
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
add XO , XO, #8
add AO1 , AO1, LDA
.endm
/*
 * SAVE_F1: contiguous path, fold the accumulated complex sum s8/s9 into one
 * y element. alpha is reloaded into s0 (real) and s1 (imaginary); YO
 * advances by 8 bytes.
 */
.macro SAVE_F1
flds s0, ALPHA_R
flds s1, ALPHA_I
fldmias YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 }
add YO, YO, #8
.endm
/****************************************************************************************/
/*
 * INIT_S4: clear the eight accumulators s8-s15 (four complex sums) for the
 * strided 4-row kernel.
 *
 * The zero is written from a core register. Subtracting s8 from itself is
 * not safe: s8 still holds the previous row block result (or the caller
 * value on the first pass) and Inf - Inf or NaN - NaN yields NaN, which
 * would poison every following row. Clobbers r3, which the entry code only
 * uses as a scratch register around each INIT_* call.
 */
.macro INIT_S4
mov r3 , #0 // integer zero has the same bit pattern as +0.0f
vmov s8 , r3
vmov.f32 s9 , s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8
.endm
/*
 * KERNEL_S4X4: run KERNEL_S4X1 four times (four consecutive columns for the
 * current four rows) on the strided path.
 */
.macro KERNEL_S4X4
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
.endm
/*
 * KERNEL_S4X1: strided path, one column step for four rows. Same arithmetic
 * as KERNEL_F4X1 (four complex entries at AO1 times the x value at XO,
 * accumulated into s8-s15) but without the prefetch, and XO advances by the
 * byte stride INC_X. AO1 and AO2 advance by LDA bytes; AO2 is not
 * dereferenced in this macro.
 */
.macro KERNEL_S4X1
flds s0 , [ AO1 ] // rows 0 and 1
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]
flds s4 , [ XO ] // x value
flds s5 , [ XO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s0, s5
fmacs s10 , s2, s4
fmacs s11 , s2, s5
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
KMAC_R s10 , s3, s5
KMAC_I s11 , s3, s4
flds s0 , [ AO1, #16 ] // rows 2 and 3
flds s1 , [ AO1, #20 ]
flds s2 , [ AO1, #24 ]
flds s3 , [ AO1, #28 ]
fmacs s12 , s0, s4
fmacs s13 , s0, s5
fmacs s14 , s2, s4
fmacs s15 , s2, s5
KMAC_R s12 , s1, s5
KMAC_I s13 , s1, s4
KMAC_R s14 , s3, s5
KMAC_I s15 , s3, s4
add XO , XO, INC_X
add AO1 , AO1, LDA
add AO2 , AO2, LDA
.endm
/*
 * SAVE_S4: strided path, fold the four accumulated complex sums (s8-s15)
 * into four y elements. alpha is reloaded into s0 (real) and s1
 * (imaginary); each y element is updated in place and YO then advances by
 * the byte stride INC_Y.
 */
.macro SAVE_S4
flds s0, ALPHA_R
flds s1, ALPHA_I
fldmias YO, { s4 - s5 } // y element 0
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 }
add YO, YO, INC_Y
fldmias YO, { s6 - s7 } // y element 1
FMAC_R1 s6 , s0 , s10
FMAC_I1 s7 , s0 , s11
FMAC_R2 s6 , s1 , s11
FMAC_I2 s7 , s1 , s10
fstmias YO, { s6 - s7 }
add YO, YO, INC_Y
fldmias YO, { s4 - s5 } // y element 2
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
fstmias YO, { s4 - s5 }
add YO, YO, INC_Y
fldmias YO, { s6 - s7 } // y element 3
FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14
fstmias YO, { s6 - s7 }
add YO, YO, INC_Y
.endm
/*
 * INIT_S1: clear the accumulator pair s8/s9 for the strided 1-row kernel.
 *
 * The zero is written from a core register. Subtracting s8 from itself is
 * not safe: s8 still holds the previous row result and Inf - Inf or
 * NaN - NaN yields NaN, which would poison every following row. Clobbers
 * r3, which the entry code only uses as a scratch register around each
 * INIT_* call.
 */
.macro INIT_S1
mov r3 , #0 // integer zero has the same bit pattern as +0.0f
vmov s8 , r3
vmov.f32 s9 , s8
.endm
/*
 * KERNEL_S1X1: strided path, one column step for a single row. Multiplies
 * the complex entry at AO1 by the complex x value at XO and accumulates
 * into s8/s9. XO advances by the byte stride INC_X and AO1 by LDA bytes.
 */
.macro KERNEL_S1X1
flds s0 , [ AO1 ]
flds s1 , [ AO1, #4 ]
flds s4 , [ XO ]
flds s5 , [ XO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s0, s5
KMAC_R s8 , s1, s5
KMAC_I s9 , s1, s4
add XO , XO, INC_X
add AO1 , AO1, LDA
.endm
/*
 * SAVE_S1: strided path, fold the accumulated complex sum s8/s9 into one y
 * element. alpha is reloaded into s0 (real) and s1 (imaginary); YO advances
 * by the byte stride INC_Y.
 */
.macro SAVE_S1
flds s0, ALPHA_R
flds s1, ALPHA_I
fldmias YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s8
FMAC_I1 s5 , s0 , s9
FMAC_R2 s4 , s1 , s9
FMAC_I2 s5 , s1 , s8
fstmias YO, { s4 - s5 }
add YO, YO, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * CGEMV (non-transposed) entry point for complex single precision.
 * In: M = r0, N = r1, A = r3, alpha in s0 (real) and s1 (imaginary);
 * lda, x, inc_x, y and inc_y are read from the stack through the OLD_ and
 * X / Y operands. Returns 0 in r0.
 *
 * Returns early when M <= 0, N <= 0 or either increment is zero. Rows are
 * processed in blocks of four (M / 4 blocks) followed by M & 3 single rows;
 * for each block the kernel walks all N columns (N / 4 unrolled steps plus
 * N & 3 single steps), then the SAVE_* macro adds alpha times the sums to
 * y. The contiguous path is taken when both increments are 1, otherwise
 * the strided path with byte strides.
 * NOTE(review): the DOUBLE branches are kept as in the original; the
 * macros in this file only use single-precision registers.
 */
PROLOGUE
.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp = value of sp on entry (7 registers pushed)
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192 // save area for the VFP registers
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
cmp OLD_M, #0
ble cgemvn_kernel_L999
cmp N, #0
ble cgemvn_kernel_L999
/* Spill values whose registers are reused as working registers. */
str OLD_A, A
str OLD_M, M
vstr s0 , ALPHA_R
vstr s1 , ALPHA_I
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
cmp INC_X, #0
beq cgemvn_kernel_L999
cmp INC_Y, #0
beq cgemvn_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE * 2
#else
lsl LDA, LDA, #3 // LDA * SIZE * 2
#endif
cmp INC_X, #1
bne cgemvn_kernel_S4_BEGIN
cmp INC_Y, #1
bne cgemvn_kernel_S4_BEGIN
/* Contiguous path: blocks of four rows. */
cgemvn_kernel_F4_BEGIN:
ldr YO , Y
ldr I, M
asrs I, I, #2 // I = M / 4
ble cgemvn_kernel_F1_BEGIN
cgemvn_kernel_F4X4:
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #32 // next block starts four complex rows further
str r3 , A
add AO2, AO2, LDA
add AO2, AO2, LDA // AO2 = AO1 + 3 * LDA, used as prefetch address
ldr XO , X
INIT_F4
asrs J, N, #2 // J = N / 4
ble cgemvn_kernel_F4X1
cgemvn_kernel_F4X4_10:
KERNEL_F4X4
subs J, J, #1
bne cgemvn_kernel_F4X4_10
cgemvn_kernel_F4X1:
ands J, N , #3 // remaining 0..3 columns
ble cgemvn_kernel_F4_END
cgemvn_kernel_F4X1_10:
KERNEL_F4X1
subs J, J, #1
bne cgemvn_kernel_F4X1_10
cgemvn_kernel_F4_END:
SAVE_F4
subs I , I , #1
bne cgemvn_kernel_F4X4
/* Contiguous path: remaining single rows. */
cgemvn_kernel_F1_BEGIN:
ldr I, M
ands I, I , #3 // remaining 0..3 rows
ble cgemvn_kernel_L999
cgemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #8 // next row starts one complex element further
str r3, A
ldr XO , X
INIT_F1
mov J, N // N > 0 was checked in the prologue
cgemvn_kernel_F1X1_10:
KERNEL_F1X1
subs J, J, #1
bne cgemvn_kernel_F1X1_10
cgemvn_kernel_F1_END:
SAVE_F1
subs I , I , #1
bne cgemvn_kernel_F1X1
b cgemvn_kernel_L999
/*************************************************************************************************************/
/* Strided path: increments become byte strides first. */
cgemvn_kernel_S4_BEGIN:
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif
ldr YO , Y
ldr I, M
asrs I, I, #2 // I = M / 4
ble cgemvn_kernel_S1_BEGIN
cgemvn_kernel_S4X4:
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #32 // next block starts four complex rows further
str r3 , A
ldr XO , X
INIT_S4
asrs J, N, #2 // J = N / 4
ble cgemvn_kernel_S4X1
cgemvn_kernel_S4X4_10:
KERNEL_S4X4
subs J, J, #1
bne cgemvn_kernel_S4X4_10
cgemvn_kernel_S4X1:
ands J, N , #3 // remaining 0..3 columns
ble cgemvn_kernel_S4_END
cgemvn_kernel_S4X1_10:
KERNEL_S4X1
subs J, J, #1
bne cgemvn_kernel_S4X1_10
cgemvn_kernel_S4_END:
SAVE_S4
subs I , I , #1
bne cgemvn_kernel_S4X4
/* Strided path: remaining single rows. */
cgemvn_kernel_S1_BEGIN:
ldr I, M
ands I, I , #3 // remaining 0..3 rows
ble cgemvn_kernel_L999
cgemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #8 // next row starts one complex element further
str r3, A
ldr XO , X
INIT_S1
mov J, N // N > 0 was checked in the prologue
cgemvn_kernel_S1X1_10:
KERNEL_S1X1
subs J, J, #1
bne cgemvn_kernel_S1X1_10
cgemvn_kernel_S1_END:
SAVE_S1
subs I , I , #1
bne cgemvn_kernel_S1X1
/*************************************************************************************************************/
/* Common exit: restore VFP registers, unwind the frame, return 0. */
cgemvn_kernel_L999:
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

607
kernel/arm/cgemv_t_vfp.S Normal file
View File

@ -0,0 +1,607 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the pushed registers.
#define STACKSIZE 256
// Stack-passed arguments. The prologue pushes seven registers and then sets
// fp = sp + 28, so [fp, #0] is the first word above the saved registers.
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
// Arguments that arrive in core registers.
#define OLD_A r3
#define OLD_N r1
#define M r0
// Working registers. AO1 shares r1 with OLD_N; the prologue spills OLD_N to
// the N stack slot before r1 is reused as a pointer into A.
#define AO1 r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
// Spill slots inside the reserved stack area (written once by the prologue;
// A is rewritten as the kernel advances to the next column or column pair).
#define N [fp, #-252 ]
#define A [fp, #-256 ]
// Prefetch distances. No instruction visible in this kernel references them.
#define X_PRE 512
#define A_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// Select the multiply-accumulate flavour for each of the four CONJ / XCONJ
// build variants. fmacs adds the product to the destination register,
// fnmacs subtracts it.
//   KMAC_R / KMAC_I  : used by the KERNEL_* macros for the two products that
//                      involve the second float of the matrix element.
//   FMAC_R1/R2/I1/I2 : used by the SAVE_* macros when the accumulated sums are
//                      scaled by (s0, s1) and added into y.
// NOTE(review): CONJ and XCONJ come from the build system; presumably they
// request conjugation of A and of x respectively - confirm against the
// kernel Makefile.
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacs
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fnmacs
#define FMAC_I1 fmacs
#define FMAC_I2 fmacs
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacs
#define KMAC_I fnmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#else
#define KMAC_R fnmacs
#define KMAC_I fmacs
#define FMAC_R1 fmacs
#define FMAC_R2 fmacs
#define FMAC_I1 fnmacs
#define FMAC_I2 fmacs
#endif
.macro INIT_F2
// Clear the four accumulators s12-s15 used by KERNEL_F2X1 / SAVE_F2.
// The zero is taken from a core register instead of subtracting each register
// from itself: x - x is NaN, not 0, when x is Inf or NaN, and s12-s15 hold
// either caller leftovers (first pass) or the sums of the previous column
// pair, so a non-finite value there would leak into unrelated elements of y.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov s12, r3
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12
.endm
.macro KERNEL_F2X4
// Four consecutive row steps for a pair of columns (unit-stride x).
.rept 4
KERNEL_F2X1
.endr
.endm
.macro KERNEL_F2X1
// One row step for two columns, unit-stride x.
// Loads two floats from x (s2, s3) and two floats from each column pointer
// (AO1 -> s4, s5 ; AO2 -> s8, s9); all three pointers are post-incremented.
// Accumulates into s12/s13 (column at AO1) and s14/s15 (column at AO2).
// The sign of the products that use s5 / s9 is chosen by KMAC_R / KMAC_I.
fldmias XO! , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fldmias AO2!, { s8 - s9 }
fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2
fmacs s14 , s8 , s2
fmacs s15 , s8 , s3
KMAC_R s14 , s9 , s3
KMAC_I s15 , s9 , s2
.endm
.macro SAVE_F2
// Fold the sums of one column pair into two adjacent y elements (four floats)
// and advance YO past them.
// s0 / s1 are never written in this file, so they still hold what the caller
// passed (presumably alpha real / imaginary - confirm against the prototype).
// s12/s13 update the first element (s4, s5), s14/s15 the second (s6, s7).
fldmias YO, { s4 - s7 }
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14
fstmias YO!, { s4 - s7 }
.endm
/************************************************************************************************/
.macro INIT_F1
// Clear the two accumulators s12-s13 used by KERNEL_F1X1 / SAVE_F1.
// The zero comes from a core register: subtracting a register from itself
// gives NaN instead of 0 when it holds Inf or NaN (left by the caller or by
// the previous column), which would corrupt an unrelated element of y.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov s12, r3
vmov.f32 s13, s12
.endm
.macro KERNEL_F1X4
// Four consecutive row steps for a single column (unit-stride x).
.rept 4
KERNEL_F1X1
.endr
.endm
.macro KERNEL_F1X1
// One row step for a single column, unit-stride x.
// Loads two floats from x (s2, s3) and two from the column (s4, s5), both
// pointers post-incremented, and accumulates into s12 / s13.
fldmias XO! , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2
.endm
.macro SAVE_F1
// Fold the sums s12 / s13 of one column into one y element (two floats),
// scaled by (s0, s1), and advance YO past it.
fldmias YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
fstmias YO!, { s4 - s5 }
.endm
/************************************************************************************************/
.macro INIT_S2
// Clear the four accumulators s12-s15 used by KERNEL_S2X1 / SAVE_S2.
// The zero is taken from a core register instead of subtracting each register
// from itself: x - x is NaN, not 0, when x is Inf or NaN, and s12-s15 hold
// either caller leftovers (first pass) or the sums of the previous column
// pair, so a non-finite value there would leak into unrelated elements of y.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov s12, r3
vmov.f32 s13, s12
vmov.f32 s14, s12
vmov.f32 s15, s12
.endm
.macro KERNEL_S2X4
// Four consecutive row steps for a pair of columns (strided x).
.rept 4
KERNEL_S2X1
.endr
.endm
.macro KERNEL_S2X1
// One row step for two columns, strided x.
// Same arithmetic as KERNEL_F2X1, but XO is not post-incremented by the load;
// it is advanced by INC_X (already scaled to bytes by the caller of the macro)
// at the end.
fldmias XO , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fldmias AO2!, { s8 - s9 }
fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2
fmacs s14 , s8 , s2
fmacs s15 , s8 , s3
KMAC_R s14 , s9 , s3
KMAC_I s15 , s9 , s2
add XO, XO, INC_X
.endm
.macro SAVE_S2
// Fold the sums of one column pair into two y elements that are INC_Y bytes
// apart. Each element is loaded, updated with (s0, s1) times its pair of
// sums, stored back, and YO is stepped by INC_Y.
fldmias YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
fstmias YO, { s4 - s5 }
add YO, YO, INC_Y
// second element of the pair, from s14 / s15
fldmias YO, { s6 - s7 }
FMAC_R1 s6 , s0 , s14
FMAC_I1 s7 , s0 , s15
FMAC_R2 s6 , s1 , s15
FMAC_I2 s7 , s1 , s14
fstmias YO, { s6 - s7 }
add YO, YO, INC_Y
.endm
/************************************************************************************************/
.macro INIT_S1
// Clear the two accumulators s12-s13 used by KERNEL_S1X1 / SAVE_S1.
// The zero comes from a core register: subtracting a register from itself
// gives NaN instead of 0 when it holds Inf or NaN (left by the caller or by
// the previous column), which would corrupt an unrelated element of y.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov s12, r3
vmov.f32 s13, s12
.endm
.macro KERNEL_S1X4
// Four consecutive row steps for a single column (strided x).
.rept 4
KERNEL_S1X1
.endr
.endm
.macro KERNEL_S1X1
// One row step for a single column, strided x: like KERNEL_F1X1, except that
// XO is advanced by INC_X bytes after the arithmetic instead of by the load.
fldmias XO , { s2 - s3 }
fldmias AO1!, { s4 - s5 }
fmacs s12 , s4 , s2
fmacs s13 , s4 , s3
KMAC_R s12 , s5 , s3
KMAC_I s13 , s5 , s2
add XO, XO, INC_X
.endm
.macro SAVE_S1
// Fold the sums s12 / s13 of one column into the y element at YO, scaled by
// (s0, s1), then step YO by INC_Y bytes.
fldmias YO, { s4 - s5 }
FMAC_R1 s4 , s0 , s12
FMAC_I1 s5 , s0 , s13
FMAC_R2 s4 , s1 , s13
FMAC_I2 s5 , s1 , s12
fstmias YO, { s4 - s5 }
add YO, YO, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Kernel entry point.
// For every column of A (columns are LDA elements apart) the code forms the
// dot product of that column with x over M rows and adds it, scaled by
// (s0, s1), into the matching element of y. Columns are processed two at a
// time; an odd last column is handled separately. A unit-stride path (F*) is
// used when both increments are 1, otherwise the strided path (S*).
// Returns 0 in r0. Returns without touching y if M <= 0, N <= 0 or either
// increment is 0.
PROLOGUE
.align 5
push {r4 - r9 , fp}
// seven registers pushed: fp now equals the stack pointer value at entry
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
cmp M, #0
ble cgemvt_kernel_L999
cmp OLD_N, #0
ble cgemvt_kernel_L999
// spill A and N: r3 becomes scratch and r1 becomes AO1 from here on
str OLD_A, A
str OLD_N, N
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
cmp INC_X, #0
beq cgemvt_kernel_L999
cmp INC_Y, #0
beq cgemvt_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA in bytes (16 bytes per element)
#else
lsl LDA, LDA, #3 // LDA in bytes (8 bytes per element)
#endif
cmp INC_X, #1
bne cgemvt_kernel_S2_BEGIN
cmp INC_Y, #1
bne cgemvt_kernel_S2_BEGIN
// ---------------- unit-stride path ----------------
cgemvt_kernel_F2_BEGIN:
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble cgemvt_kernel_F1_BEGIN
cgemvt_kernel_F2X4:
// AO1 / AO2 = current column pair; A is advanced by two columns for next pass
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
ldr XO , X
INIT_F2
asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_F2X1
cgemvt_kernel_F2X4_10:
KERNEL_F2X4
subs I, I, #1
bne cgemvt_kernel_F2X4_10
cgemvt_kernel_F2X1:
ands I, M , #3 // I = M % 4
ble cgemvt_kernel_F2_END
cgemvt_kernel_F2X1_10:
KERNEL_F2X1
subs I, I, #1
bne cgemvt_kernel_F2X1_10
cgemvt_kernel_F2_END:
SAVE_F2
subs J , J , #1
bne cgemvt_kernel_F2X4
cgemvt_kernel_F1_BEGIN:
ldr J, N
ands J, J, #1 // odd N: one column left
ble cgemvt_kernel_L999
cgemvt_kernel_F1X4:
ldr AO1, A
ldr XO , X
INIT_F1
asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_F1X1
cgemvt_kernel_F1X4_10:
KERNEL_F1X4
subs I, I, #1
bne cgemvt_kernel_F1X4_10
cgemvt_kernel_F1X1:
ands I, M , #3 // I = M % 4
ble cgemvt_kernel_F1_END
cgemvt_kernel_F1X1_10:
KERNEL_F1X1
subs I, I, #1
bne cgemvt_kernel_F1X1_10
cgemvt_kernel_F1_END:
SAVE_F1
b cgemvt_kernel_L999
/*************************************************************************************************************/
// ---------------- strided path ----------------
cgemvt_kernel_S2_BEGIN:
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X in bytes (16 bytes per element)
lsl INC_Y, INC_Y, #4 // INC_Y in bytes (16 bytes per element)
#else
lsl INC_X, INC_X, #3 // INC_X in bytes (8 bytes per element)
lsl INC_Y, INC_Y, #3 // INC_Y in bytes (8 bytes per element)
#endif
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble cgemvt_kernel_S1_BEGIN
cgemvt_kernel_S2X4:
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
ldr XO , X
INIT_S2
asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_S2X1
cgemvt_kernel_S2X4_10:
KERNEL_S2X4
subs I, I, #1
bne cgemvt_kernel_S2X4_10
cgemvt_kernel_S2X1:
ands I, M , #3 // I = M % 4
ble cgemvt_kernel_S2_END
cgemvt_kernel_S2X1_10:
KERNEL_S2X1
subs I, I, #1
bne cgemvt_kernel_S2X1_10
cgemvt_kernel_S2_END:
SAVE_S2
subs J , J , #1
bne cgemvt_kernel_S2X4
cgemvt_kernel_S1_BEGIN:
ldr J, N
ands J, J, #1 // odd N: one column left
ble cgemvt_kernel_L999
cgemvt_kernel_S1X4:
ldr AO1, A
ldr XO , X
INIT_S1
asrs I, M, #2 // I = M / 4
ble cgemvt_kernel_S1X1
cgemvt_kernel_S1X4_10:
KERNEL_S1X4
subs I, I, #1
bne cgemvt_kernel_S1X4_10
cgemvt_kernel_S1X1:
ands I, M , #3 // I = M % 4
ble cgemvt_kernel_S1_END
cgemvt_kernel_S1X1_10:
KERNEL_S1X1
subs I, I, #1
bne cgemvt_kernel_S1X1_10
cgemvt_kernel_S1_END:
SAVE_S1
/*************************************************************************************************************/
// common exit: restore saved registers, release the frame, return 0
cgemvt_kernel_L999:
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

59
kernel/arm/copy.c Normal file
View File

@ -0,0 +1,59 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Reference strided copy: y[k * inc_y] = x[k * inc_x] for k = 0 .. n-1.
 * Does nothing when n <= 0. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG src = 0;
	BLASLONG dst = 0;
	BLASLONG left;

	for (left = n; left > 0; left--) {
		y[dst] = x[src];
		src += inc_x;
		dst += inc_y;
	}

	return(0);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

222
kernel/arm/dcopy_vfp.S Normal file
View File

@ -0,0 +1,222 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the pushed registers.
#define STACKSIZE 256
// Arguments that arrive in core registers.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
// Stack-passed argument. The prologue sets fp = sp + 24 after pushing seven
// registers, so [fp, #4] is the first word above the saved registers.
#define OLD_INC_Y [fp, #4 ]
// Working registers: loop counter, destination pointer, destination stride.
#define I r5
#define Y r6
#define INC_Y r7
// Prefetch distance in bytes used by COPY_F4.
#define X_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro COPY_F4
// Copy four contiguous doubles from X to Y through d0-d3, post-incrementing
// both pointers, after prefetching X_PRE bytes ahead of X.
pld [ X, #X_PRE ]
fldmiad X!, { d0 - d3 }
fstmiad Y!, { d0 - d3 }
.endm
.macro COPY_F1
// Copy one double from X to Y through d0, post-incrementing both pointers.
fldmiad X!, { d0 }
fstmiad Y!, { d0 }
.endm
/*************************************************************************************************************************/
.macro COPY_S4
// Copy four doubles with byte strides INC_X / INC_Y, alternating between d0
// and d1 as the transfer register.
nop
.rept 2
fldmiad X, { d0 }
fstmiad Y, { d0 }
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d1 }
fstmiad Y, { d1 }
add X, X, INC_X
add Y, Y, INC_Y
.endr
.endm
.macro COPY_S1
// Copy one double from X to Y through d0, then step the pointers by the byte
// strides INC_X and INC_Y.
fldmiad X, { d0 }
fstmiad Y, { d0 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Kernel entry point: copies N doubles from X (stride INC_X) to Y (stride
// INC_Y). Uses the contiguous macros when both strides are 1, the strided
// macros otherwise. Returns 0 in r0; does nothing when N <= 0 or either
// stride is 0.
PROLOGUE
.align 5
push {r4 - r9, fp}
// fp points at the saved fp slot, 4 bytes below the entry stack pointer
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
cmp N, #0
ble dcopy_kernel_L999
cmp INC_X, #0
beq dcopy_kernel_L999
cmp INC_Y, #0
beq dcopy_kernel_L999
cmp INC_X, #1
bne dcopy_kernel_S_BEGIN
cmp INC_Y, #1
bne dcopy_kernel_S_BEGIN
// ---------------- contiguous path ----------------
dcopy_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble dcopy_kernel_F1
dcopy_kernel_F4:
COPY_F4
subs I, I, #1
bne dcopy_kernel_F4
dcopy_kernel_F1:
ands I, N, #3 // I = N % 4
ble dcopy_kernel_L999
dcopy_kernel_F10:
COPY_F1
subs I, I, #1
bne dcopy_kernel_F10
b dcopy_kernel_L999
// ---------------- strided path ----------------
dcopy_kernel_S_BEGIN:
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
asrs I, N, #2 // I = N / 4
ble dcopy_kernel_S1
dcopy_kernel_S4:
COPY_S4
subs I, I, #1
bne dcopy_kernel_S4
dcopy_kernel_S1:
ands I, N, #3 // I = N % 4
ble dcopy_kernel_L999
dcopy_kernel_S10:
COPY_S1
subs I, I, #1
bne dcopy_kernel_S10
// common exit: restore saved registers, release the frame, return 0
dcopy_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

248
kernel/arm/ddot_vfp.S Normal file
View File

@ -0,0 +1,248 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the pushed registers.
#define STACKSIZE 256
// Arguments that arrive in core registers.
#define N r0
#define X r1
#define INC_X r2
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
// Stack-passed argument. The prologue sets fp = sp + 24 after pushing seven
// registers, so [fp, #4] is the first word above the saved registers.
#define OLD_INC_Y [fp, #4 ]
// Working registers: loop counter, second vector pointer and its stride.
#define I r5
#define Y r6
#define INC_Y r7
// Prefetch distance in bytes; KERNEL_F4 uses it for both X and Y.
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro KERNEL_F4
// Four contiguous multiply-accumulate steps. Elements of X go to d8-d11,
// elements of Y to d4-d7; both pointers are post-incremented by each load.
// Products alternate between the two partial sums d0 and d1, which the
// epilogue adds together. Loads are interleaved with the arithmetic.
pld [ X, #X_PRE ]
fldmiad X!, { d8 }
pld [ Y, #X_PRE ]
fldmiad Y!, { d4 }
fldmiad Y!, { d5 }
fmacd d0 , d4, d8
fldmiad X!, { d9 }
fldmiad Y!, { d6 }
fmacd d1 , d5, d9
fldmiad X!, { d10 }
fldmiad X!, { d11 }
fmacd d0 , d6, d10
fldmiad Y!, { d7 }
fmacd d1 , d7, d11
.endm
.macro KERNEL_F1
// One contiguous step: d0 += X[0] * Y[0], both pointers post-incremented.
fldmiad X!, { d4 }
fldmiad Y!, { d8 }
fmacd d0 , d4, d8
.endm
/*************************************************************************************************************************/
.macro KERNEL_S4
// Four strided multiply-accumulate steps. After each pair of loads the
// pointers are advanced by the byte strides INC_X / INC_Y. Products alternate
// between the partial sums d0 and d1.
nop
fldmiad X, { d4 }
fldmiad Y, { d8 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d0 , d4, d8
fldmiad X, { d5 }
fldmiad Y, { d9 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d1 , d5, d9
fldmiad X, { d6 }
fldmiad Y, { d10 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d0 , d6, d10
fldmiad X, { d7 }
fldmiad Y, { d11 }
add X, X, INC_X
add Y, Y, INC_Y
fmacd d1 , d7, d11
.endm
.macro KERNEL_S1
// One strided step: d0 += X[0] * Y[0], then advance the pointers by the byte
// strides INC_X / INC_Y.
fldmiad X, { d4 }
fldmiad Y, { d8 }
add X, X, INC_X
fmacd d0 , d4, d8
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Kernel entry point: returns in d0 the sum of X[i] * Y[i] over N elements,
// with element strides INC_X and INC_Y. The sum is kept in two partial
// accumulators (d0, d1) that are added on exit. Returns 0.0 when N <= 0 or
// either stride is 0.
PROLOGUE
.align 5
push {r4 - r9, fp}
// fp points at the saved fp slot, 4 bytes below the entry stack pointer
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
// Clear both partial sums from a core register. d0 / d1 hold whatever the
// caller left in them, and subtracting a register from itself gives NaN
// rather than 0 when that leftover is Inf or NaN, which would make the
// result NaN regardless of the input vectors. r4 is free again here.
mov r4, #0
vmov d0 , r4, r4
vmov d1 , r4, r4
cmp N, #0
ble ddot_kernel_L999
cmp INC_X, #0
beq ddot_kernel_L999
cmp INC_Y, #0
beq ddot_kernel_L999
cmp INC_X, #1
bne ddot_kernel_S_BEGIN
cmp INC_Y, #1
bne ddot_kernel_S_BEGIN
// ---------------- contiguous path ----------------
ddot_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble ddot_kernel_F1
// loop body is unrolled twice, with an exit test after each KERNEL_F4
ddot_kernel_F4:
KERNEL_F4
subs I, I, #1
ble ddot_kernel_F1
KERNEL_F4
subs I, I, #1
bne ddot_kernel_F4
ddot_kernel_F1:
ands I, N, #3 // I = N % 4
ble ddot_kernel_L999
ddot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne ddot_kernel_F10
b ddot_kernel_L999
// ---------------- strided path ----------------
ddot_kernel_S_BEGIN:
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
asrs I, N, #2 // I = N / 4
ble ddot_kernel_S1
ddot_kernel_S4:
KERNEL_S4
subs I, I, #1
bne ddot_kernel_S4
ddot_kernel_S1:
ands I, N, #3 // I = N % 4
ble ddot_kernel_L999
ddot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne ddot_kernel_S10
// common exit: restore saved registers, combine the partial sums, return
ddot_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
vadd.f64 d0 , d0, d1 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,806 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/27 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the pushed registers.
#define STACKSIZE 256
// Arguments that arrive in registers; the prologue spills all of them.
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#define OLD_A r3
#define OLD_ALPHA d0
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
// Spill slots inside the reserved stack area. LDC holds the leading dimension
// of C already converted to bytes. ALPHA is an 8-byte slot at the very bottom
// of the frame (sp equals fp - 280 after the prologue).
#define LDC [fp, #-252 ]
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define ALPHA [fp, #-280]
// Stack-passed arguments. The prologue sets fp = sp + 24 after pushing seven
// registers, so [fp, #4] is the first word above the saved registers. The C
// slot is also rewritten by the kernel as it moves to the next column block.
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
// Working registers. I, J and L reuse r0-r2 once the arguments are spilled.
#define I r0
#define J r1
#define L r2
#define AO r5
#define BO r6
#define CO1 r8
#define CO2 r9
#define K1 r7
#define BC r12
// Prefetch distances in bytes for A, B and C.
#define A_PRE 96
#define B_PRE 96
#define C_PRE 32
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro INIT4x2
// Clear the eight accumulators d8-d15 of a 4x2 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN (from the caller or from
// the previous tile), and that NaN would then be copied to every accumulator
// and end up in unrelated elements of C.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
vmov.f64 d14, d8
vmov.f64 d15, d8
.endm
.macro KERNEL4x2_SUB
// One k step of a 4x2 tile: four doubles from AO (d0-d3) times two doubles
// from BO (d4, d5), accumulated into d8-d11 (first B value) and d12-d15
// (second B value). Loads and pointer updates are interleaved with the
// arithmetic. Advances AO by 32 bytes and BO by 16 bytes.
pld [ AO, #A_PRE ]
fldd d4 , [ BO ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fmacd d8 , d0, d4
fldd d2 , [ AO, #16 ]
fmacd d9 , d1, d4
fldd d3 , [ AO, #24 ]
fmacd d10 , d2, d4
fldd d5 , [ BO, #8 ]
fmacd d11 , d3, d4
fmacd d12 , d0, d5
fmacd d13 , d1, d5
add AO , AO, #32
fmacd d14 , d2, d5
add BO , BO, #16
fmacd d15 , d3, d5
.endm
.macro SAVE4x2
// Write back a 4x2 tile: C += alpha * accumulators.
// CO2 = CO1 + LDC (bytes) addresses the second column. d8-d11 go to the four
// doubles at CO1, d12-d15 to the four at CO2. CO1 is advanced by 32 bytes.
// Overwrites r3 and d0, d4-d7.
ldr r3 , LDC
add CO2 , CO1, r3
fldd d0, ALPHA
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
pld [ CO1, #C_PRE ]
fmacd d4 , d0 , d8
fldd d6 , [CO1, #16 ]
fmacd d5 , d0 , d9
fldd d7 , [CO1, #24 ]
fmacd d6 , d0 , d10
fstd d4 , [CO1]
fmacd d7 , d0 , d11
fstd d5 , [CO1, #8 ]
fstd d6 , [CO1, #16 ]
fstd d7 , [CO1, #24 ]
// second column
fldd d4 , [CO2]
fldd d5 , [CO2, #8 ]
pld [ CO2, #C_PRE ]
fmacd d4 , d0 , d12
fldd d6 , [CO2, #16 ]
fmacd d5 , d0 , d13
fldd d7 , [CO2, #24 ]
fmacd d6 , d0 , d14
fstd d4 , [CO2]
fmacd d7 , d0 , d15
add CO1, CO1, #32
fstd d5 , [CO2, #8 ]
fstd d6 , [CO2, #16 ]
fstd d7 , [CO2, #24 ]
.endm
/******************************************************************************/
.macro INIT2x2
// Clear the four accumulators d8, d9, d12, d13 of a 2x2 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN from an earlier tile, and
// that NaN would be copied to every accumulator of this tile.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
vmov.f64 d9, d8
vmov.f64 d12, d8
vmov.f64 d13, d8
.endm
.macro KERNEL2x2_SUB
// One k step of a 2x2 tile: two doubles from AO (d0, d1) times two doubles
// from BO (d4, d5), accumulated into d8, d9 (first B value) and d12, d13
// (second B value). Advances AO and BO by 16 bytes each.
fldd d4 , [ BO ]
fldd d5 , [ BO, #8 ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fmacd d8 , d0, d4
fmacd d9 , d1, d4
fmacd d12 , d0, d5
fmacd d13 , d1, d5
add AO , AO, #16
add BO , BO, #16
.endm
.macro SAVE2x2
// Write back a 2x2 tile: C += alpha * accumulators. d8, d9 go to the two
// doubles at CO1, d12, d13 to the two at CO2 = CO1 + LDC (bytes).
// CO1 is advanced by 16 bytes. Overwrites r3 and d0, d4, d5.
ldr r3 , LDC
add CO2 , CO1, r3
fldd d0, ALPHA
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]
fldd d4 , [CO2]
fldd d5 , [CO2, #8 ]
fmacd d4 , d0 , d12
fmacd d5 , d0 , d13
fstd d4 , [CO2]
fstd d5 , [CO2, #8 ]
add CO1, CO1, #16
.endm
/******************************************************************************/
.macro INIT1x2
// Clear the two accumulators d8 and d12 of a 1x2 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN from an earlier tile.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
vmov.f64 d12, d8
.endm
.macro KERNEL1x2_SUB
// One k step of a 1x2 tile: one double from AO (d0) times two doubles from
// BO (d4, d5), accumulated into d8 and d12. Advances AO by 8 bytes and BO by
// 16 bytes.
fldd d4 , [ BO ]
fldd d5 , [ BO, #8 ]
fldd d0 , [ AO ]
fmacd d8 , d0, d4
fmacd d12 , d0, d5
add AO , AO, #8
add BO , BO, #16
.endm
.macro SAVE1x2
// Write back a 1x2 tile: the double at CO1 gets alpha * d8 added, the double
// at CO2 = CO1 + LDC (bytes) gets alpha * d12 added. CO1 is advanced by
// 8 bytes. Overwrites r3 and d0, d4.
ldr r3 , LDC
add CO2 , CO1, r3
fldd d0, ALPHA
fldd d4 , [CO1]
fmacd d4 , d0 , d8
fstd d4 , [CO1]
fldd d4 , [CO2]
fmacd d4 , d0 , d12
fstd d4 , [CO2]
add CO1, CO1, #8
.endm
/******************************************************************************/
.macro INIT4x1
// Clear the four accumulators d8-d11 of a 4x1 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN from an earlier tile, and
// that NaN would be copied to every accumulator of this tile.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
vmov.f64 d9, d8
vmov.f64 d10, d8
vmov.f64 d11, d8
.endm
.macro KERNEL4x1_SUB
// One k step of a 4x1 tile: four doubles from AO (d0-d3) times one double
// from BO (d4), accumulated into d8-d11. Advances AO by 32 bytes and BO by
// 8 bytes.
fldd d4 , [ BO ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fldd d2 , [ AO, #16 ]
fldd d3 , [ AO, #24 ]
fmacd d8 , d0, d4
fmacd d9 , d1, d4
fmacd d10 , d2, d4
fmacd d11 , d3, d4
add AO , AO, #32
add BO , BO, #8
.endm
.macro SAVE4x1
// Write back a 4x1 tile: the four doubles at CO1 get alpha * d8..d11 added.
// CO1 is advanced by 32 bytes. Overwrites d0, d4-d7.
fldd d0, ALPHA
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fldd d6 , [CO1, #16 ]
fldd d7 , [CO1, #24 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
fmacd d6 , d0 , d10
fmacd d7 , d0 , d11
fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]
fstd d6 , [CO1, #16 ]
fstd d7 , [CO1, #24 ]
add CO1, CO1, #32
.endm
/******************************************************************************/
.macro INIT2x1
// Clear the two accumulators d8 and d9 of a 2x1 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN from an earlier tile.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
vmov.f64 d9 , d8
.endm
.macro KERNEL2x1_SUB
// One k step of a 2x1 tile: two doubles from AO (d0, d1) times one double
// from BO (d4), accumulated into d8 and d9. Advances AO by 16 bytes and BO by
// 8 bytes.
fldd d4 , [ BO ]
fldd d0 , [ AO ]
fldd d1 , [ AO, #8 ]
fmacd d8 , d0, d4
fmacd d9 , d1, d4
add AO , AO, #16
add BO , BO, #8
.endm
.macro SAVE2x1
// Write back a 2x1 tile: the two doubles at CO1 get alpha * d8 and
// alpha * d9 added. CO1 is advanced by 16 bytes. Overwrites d0, d4, d5.
fldd d0, ALPHA
fldd d4 , [CO1]
fldd d5 , [CO1, #8 ]
fmacd d4 , d0 , d8
fmacd d5 , d0 , d9
fstd d4 , [CO1]
fstd d5 , [CO1, #8 ]
add CO1, CO1, #16
.endm
/******************************************************************************/
.macro INIT1x1
// Clear the single accumulator d8 of a 1x1 tile.
// The zero is built from a core register: subtracting d8 from itself gives
// NaN instead of 0 when d8 still holds Inf or NaN from an earlier tile.
// Clobbers r3, which is dead at every place this macro is invoked.
mov r3 , #0
vmov d8 , r3 , r3
.endm
.macro KERNEL1x1_SUB
// One k step of a 1x1 tile: d8 += [AO] * [BO]; both pointers advance by
// 8 bytes.
fldd d4 , [ BO ]
fldd d0 , [ AO ]
fmacd d8 , d0, d4
add AO , AO, #8
add BO , BO, #8
.endm
.macro SAVE1x1
// Write back a 1x1 tile: the double at CO1 gets alpha * d8 added.
// CO1 is advanced by 8 bytes. Overwrites d0 and d4.
fldd d0, ALPHA
fldd d4 , [CO1]
fmacd d4 , d0 , d8
fstd d4 , [CO1]
add CO1, CO1, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Kernel entry point: C += alpha * A * B on packed panels.
// The outer loop walks column blocks of width 2 (then a final single column
// when N is odd). Inside each block the rows are covered by 4-row tiles, then
// one 2-row tile and one 1-row tile as the low bits of M require. Each tile
// runs K multiply-accumulate steps, unrolled by 8 with a remainder loop.
// AO keeps advancing through A across the tiles of a block and is reset to A
// per block; BO restarts at BC for every tile and BC advances per block.
// Returns 0 in r0.
PROLOGUE
.align 5
push {r4 - r9, fp}
// fp points at the saved fp slot, 4 bytes below the entry stack pointer
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
// spill register arguments so r0-r3 and d0 can be reused
str OLD_M, M
str OLD_N, N
str OLD_K, K
str OLD_A, A
vstr OLD_ALPHA, ALPHA
sub r3, fp, #128
vstm r3, { d8 - d15} // store floating point registers
ldr r3, OLD_LDC
lsl r3, r3, #3 // ldc = ldc * 8
str r3, LDC
ldr K1, K
ldr BC, B
ldr J, N
// NOTE(review): asrs does not write the V flag, so this ble (and the other
// asrs/ands/tst + ble pairs below) also depends on the V value left by an
// earlier instruction - here by the caller. Confirm this is acceptable.
asrs J, J, #1 // J = J / 2
ble dgemm_kernel_L1_BEGIN
/*********************************************************************************************/
// ---------------- column blocks of width 2 ----------------
dgemm_kernel_L2_BEGIN:
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
add r3 , r4, CO1
str r3 , C // store C
ldr AO, A // AO = A
dgemm_kernel_L2_M4_BEGIN:
ldr I, M
asrs I, I, #2 // I = I / 4
ble dgemm_kernel_L2_M2_BEGIN
dgemm_kernel_L2_M4_20:
INIT4x2
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L2_M4_40
.align 5
dgemm_kernel_L2_M4_22:
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M4_22
dgemm_kernel_L2_M4_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L2_M4_100
dgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M4_42
dgemm_kernel_L2_M4_100:
SAVE4x2
dgemm_kernel_L2_M4_END:
subs I, I, #1
bgt dgemm_kernel_L2_M4_20
dgemm_kernel_L2_M2_BEGIN:
ldr I, M
tst I , #3 // no rows left when M is a multiple of 4
ble dgemm_kernel_L2_END
tst I, #2 // bit 1 of M set: one 2-row tile remains
ble dgemm_kernel_L2_M1_BEGIN
dgemm_kernel_L2_M2_20:
INIT2x2
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L2_M2_40
dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M2_22
dgemm_kernel_L2_M2_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L2_M2_100
dgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M2_42
dgemm_kernel_L2_M2_100:
SAVE2x2
dgemm_kernel_L2_M2_END:
dgemm_kernel_L2_M1_BEGIN:
tst I, #1 // bit 0 of M set: one 1-row tile remains
ble dgemm_kernel_L2_END
dgemm_kernel_L2_M1_20:
INIT1x2
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L2_M1_40
dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M1_22
dgemm_kernel_L2_M1_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L2_M1_100
dgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs L, L, #1
bgt dgemm_kernel_L2_M1_42
dgemm_kernel_L2_M1_100:
SAVE1x2
dgemm_kernel_L2_END:
// advance BC past the two packed columns just consumed
mov r3, BC
mov r4, K1
lsl r4, r4, #4 // k * 2 * 8
add r3, r3, r4 // B = B + K * 2 * 8
mov BC, r3
subs J , #1 // j--
bgt dgemm_kernel_L2_BEGIN
/*********************************************************************************************/
// ---------------- final single column when N is odd ----------------
dgemm_kernel_L1_BEGIN:
ldr J , N
tst J , #1
ble dgemm_kernel_L999
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
str r3 , C // store C
ldr AO, A // AO = A
dgemm_kernel_L1_M4_BEGIN:
ldr I, M
asrs I, I, #2 // I = I / 4
ble dgemm_kernel_L1_M2_BEGIN
dgemm_kernel_L1_M4_20:
INIT4x1
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L1_M4_40
.align 5
dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M4_22
dgemm_kernel_L1_M4_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L1_M4_100
dgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M4_42
dgemm_kernel_L1_M4_100:
SAVE4x1
dgemm_kernel_L1_M4_END:
subs I, I, #1
bgt dgemm_kernel_L1_M4_20
dgemm_kernel_L1_M2_BEGIN:
ldr I, M
tst I , #3 // no rows left when M is a multiple of 4
ble dgemm_kernel_L1_END
tst I, #2 // bit 1 of M set: one 2-row tile remains
ble dgemm_kernel_L1_M1_BEGIN
dgemm_kernel_L1_M2_20:
INIT2x1
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L1_M2_40
dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M2_22
dgemm_kernel_L1_M2_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L1_M2_100
dgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M2_42
dgemm_kernel_L1_M2_100:
SAVE2x1
dgemm_kernel_L1_M2_END:
dgemm_kernel_L1_M1_BEGIN:
tst I, #1 // bit 0 of M set: one 1-row tile remains
ble dgemm_kernel_L1_END
dgemm_kernel_L1_M1_20:
INIT1x1
mov BO, BC
asrs L , K1, #3 // L = K / 8
ble dgemm_kernel_L1_M1_40
dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M1_22
dgemm_kernel_L1_M1_40:
ands L , K1, #7 // L = K % 8
ble dgemm_kernel_L1_M1_100
dgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs L, L, #1
bgt dgemm_kernel_L1_M1_42
dgemm_kernel_L1_M1_100:
SAVE1x1
dgemm_kernel_L1_END:
// common exit: restore saved registers, release the frame, return 0
dgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
movs r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/24 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Scratch stack size; defined here but not referenced by this routine. */
#define STACKSIZE 256
/* Incoming argument registers. Only OLD_LDA is referenced by name below. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/* Destination pointer: first word above the seven registers pushed in the
   prologue (fp is set to sp + 24 right after the push). */
#define B [fp, #4 ]
/* M, N and A stay in the registers they arrive in. */
#define M r0
#define N r1
#define A r2
/* BO: write cursor into the packed buffer. AO1/AO2: read cursors, LDA bytes apart. */
#define BO r5
#define AO1 r6
#define AO2 r7
/* LDA holds the stride in bytes (incoming value shifted left by 3). */
#define LDA r8
/* I reuses r3 after OLD_LDA has been scaled into LDA; J is the outer counter. */
#define I r3
#define J r12
/* Prefetch distance; not referenced by this routine. */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* COPY2x2: read two doubles from AO1 and two from AO2 and write them to BO
   interleaved as AO1[0], AO2[0], AO1[1], AO2[1]. Advances AO1 and AO2 by 16
   bytes and BO by 32 bytes. Uses d0-d3. */
.macro COPY2x2
fldd d0 , [ AO1, #0 ]
fldd d2 , [ AO1, #8 ]
fldd d1 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #16
fstmiad BO!, { d0 - d3 }
add AO2, AO2, #16
.endm
/* COPY1x2: read one double from AO1 and one from AO2 and write them to BO in
   that order. Advances AO1 and AO2 by 8 bytes and BO by 16 bytes. Uses d0-d1. */
.macro COPY1x2
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8
fstmiad BO!, { d0 - d1 }
add AO2, AO2, #8
.endm
/* COPY2x1: copy two consecutive doubles from AO1 to BO. Advances both
   pointers by 16 bytes. Uses d0-d1. */
.macro COPY2x1
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fstmiad BO!, { d0 - d1 }
add AO1, AO1, #16
.endm
/* COPY1x1: copy one double from AO1 to BO. Advances both pointers by 8 bytes. */
.macro COPY1x1
fldd d0 , [ AO1, #0 ]
fstmiad BO!, { d0 }
add AO1, AO1, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Packs doubles from A into the contiguous buffer B.
 * Outer loop: N / 2 passes, each reading two vectors LDA bytes apart and
 * interleaving them element by element; then one extra pass over a single
 * vector when bit 0 of N is set.
 * Inner loops: M / 2 two-element copies followed by one single-element copy
 * when M is odd.
 * Returns 0 in r0.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> highest pushed word, so [fp, #4] is the first stack argument
lsl LDA, OLD_LDA, #3 // lda = lda * 8
ldr BO, B
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
asrs J, N, #1 // J = N / 2
ble dgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M2_BEGIN:
mov AO1, A // AO1 = A
add AO2, AO1, LDA
add A , AO2, LDA // A = A + 2 * LDA
asrs I, M, #1 // I = M / 2
ble dgemm_ncopy_L2_M2_40
dgemm_ncopy_L2_M2_20:
COPY2x2
subs I , I , #1
bne dgemm_ncopy_L2_M2_20
dgemm_ncopy_L2_M2_40:
ands I, M , #1 // I = M % 2
ble dgemm_ncopy_L2_M2_END
dgemm_ncopy_L2_M2_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M2_60
dgemm_ncopy_L2_M2_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L2_M2_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
// NOTE(review): tst, ands and asrs do not update V, while ble tests Z or (N != V);
// the ble branches after them therefore rely on V being clear - confirm, or consider beq.
tst N, #1
ble dgemm_ncopy_L999
dgemm_ncopy_L1_M2_BEGIN:
mov AO1, A // AO1 = A
add A , AO1, LDA // A = A + 1 * LDA
asrs I, M, #1 // I = M / 2
ble dgemm_ncopy_L1_M2_40
dgemm_ncopy_L1_M2_20:
COPY2x1
subs I , I , #1
bne dgemm_ncopy_L1_M2_20
dgemm_ncopy_L1_M2_40:
ands I, M , #1 // I = M % 2
ble dgemm_ncopy_L1_M2_END
dgemm_ncopy_L1_M2_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M2_60
dgemm_ncopy_L1_M2_END:
dgemm_ncopy_L999:
movs r0, #0 // set return value
sub sp, fp, #24 // back to the address of the pushed registers
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,349 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers in the prologue. */
#define STACKSIZE 256
/* Incoming argument registers (not referenced by name; the prologue uses r3 directly). */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Stride in bytes, spilled to the scratch area because all free core
   registers are taken by the four read cursors. */
#define LDA [fp, #-260 ]
/* Destination pointer: first word above the seven pushed registers. */
#define B [fp, #4 ]
/* M, N and A stay in the registers they arrive in. */
#define M r0
#define N r1
#define A r2
/* BO: write cursor. AO1-AO4: read cursors, each one stride after the previous. */
#define BO r5
#define AO1 r6
#define AO2 r7
#define AO3 r8
#define AO4 r9
/* I reuses r3 after the scaled stride has been stored; J is the outer counter. */
#define I r3
#define J r12
/* Prefetch distance in bytes used by COPY4x4. */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* COPY4x4: read four consecutive doubles from each of AO1..AO4 and write all
   16 to BO, grouped by element index: element 0 of AO1..AO4 (d0-d3), then
   element 1 (d4-d7), element 2 (d8-d11), element 3 (d12-d15).
   Advances each AOx by 32 bytes and BO by 128 bytes. Prefetches A_PRE bytes
   ahead of every read cursor. Uses d0-d15; loads and pointer updates are
   interleaved, so keep the instruction order as is. */
.macro COPY4x4
pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
pld [ AO3, #A_PRE ]
pld [ AO4, #A_PRE ]
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
fldd d2 , [ AO3, #0 ]
fldd d3 , [ AO4, #0 ]
fldd d4 , [ AO1, #8 ]
fldd d8 , [ AO1, #16 ]
fldd d12, [ AO1, #24 ]
fldd d5 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d9 , [ AO2, #16 ]
fldd d13, [ AO2, #24 ]
fldd d6 , [ AO3, #8 ]
add AO2, AO2, #32
fldd d10, [ AO3, #16 ]
fldd d14, [ AO3, #24 ]
fldd d7 , [ AO4, #8 ]
add AO3, AO3, #32
fldd d11, [ AO4, #16 ]
fldd d15, [ AO4, #24 ]
fstmiad BO!, { d0 - d3 }
add AO4, AO4, #32
fstmiad BO!, { d4 - d7 }
fstmiad BO!, { d8 - d15 }
.endm
/* COPY1x4: read one double from each of AO1..AO4 and write them to BO in that
   order. Advances each AOx by 8 bytes and BO by 32 bytes. Uses d0-d3. */
.macro COPY1x4
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8
fldd d2 , [ AO3, #0 ]
add AO2, AO2, #8
fldd d3 , [ AO4, #0 ]
add AO3, AO3, #8
fstmiad BO!, { d0 - d3 }
add AO4, AO4, #8
.endm
/* COPY4x2: read four consecutive doubles from AO1 and from AO2 and write them
   to BO interleaved (AO1[0], AO2[0], AO1[1], AO2[1], ...). Advances AO1 and
   AO2 by 32 bytes and BO by 64 bytes. Uses d0-d7. */
.macro COPY4x2
fldd d0 , [ AO1, #0 ]
fldd d2 , [ AO1, #8 ]
fldd d4 , [ AO1, #16 ]
fldd d6 , [ AO1, #24 ]
fldd d1 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d5 , [ AO2, #16 ]
fldd d7 , [ AO2, #24 ]
fstmiad BO!, { d0 - d7 }
add AO2, AO2, #32
.endm
/* COPY1x2: read one double from AO1 and one from AO2 and write them to BO in
   that order. Advances AO1 and AO2 by 8 bytes and BO by 16 bytes. */
.macro COPY1x2
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO2, #0 ]
add AO1, AO1, #8
fstmiad BO!, { d0 - d1 }
add AO2, AO2, #8
.endm
/* COPY4x1: copy four consecutive doubles from AO1 to BO. Advances both
   pointers by 32 bytes. Uses d0-d3. */
.macro COPY4x1
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]
fstmiad BO!, { d0 - d3 }
add AO1, AO1, #32
.endm
/* COPY1x1: copy one double from AO1 to BO. Advances both pointers by 8 bytes. */
.macro COPY1x1
fldd d0 , [ AO1, #0 ]
fstmiad BO!, { d0 }
add AO1, AO1, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Packs doubles from A into the contiguous buffer B.
 * Outer structure: N / 4 passes over four vectors one stride apart, then one
 * pass over two vectors when bit 1 of N is set, then one pass over a single
 * vector when bit 0 of N is set.
 * Inner loops: M / 4 four-element copies followed by M % 4 single-element copies.
 * d8-d15 are saved on entry and restored on exit because COPY4x4 overwrites them.
 * Returns 0 in r0.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> highest pushed word, so [fp, #4] is the first stack argument
sub sp, sp, #STACKSIZE // reserve stack
lsl r3, r3, #3 // lda = lda * 8
str r3, LDA // spill the byte stride; r3 is reused as I afterwards
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
ldr BO, B
dgemm_ncopy_L4_BEGIN:
asrs J, N, #2 // J = N / 4
ble dgemm_ncopy_L2_BEGIN
dgemm_ncopy_L4_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add AO3, AO2, r4
add AO4, AO3, r4
add A , AO4, r4 // A = A + 4 * LDA
asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L4_M4_40
dgemm_ncopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_20
dgemm_ncopy_L4_M4_40:
ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L4_M4_END
dgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M4_60
dgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
// NOTE(review): tst, ands and asrs do not update V, while ble tests Z or (N != V);
// the ble branches after them therefore rely on V being clear - confirm, or consider beq.
tst N, #3
ble dgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA
asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L2_M4_40
dgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_20
dgemm_ncopy_L2_M4_40:
ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L2_M4_END
dgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M4_60
dgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
dgemm_ncopy_L1_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA
asrs I, M, #2 // I = M / 4
ble dgemm_ncopy_L1_M4_40
dgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_20
dgemm_ncopy_L1_M4_40:
ands I, M , #3 // I = M % 4
ble dgemm_ncopy_L1_M4_END
dgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M4_60
dgemm_ncopy_L1_M4_END:
dgemm_ncopy_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
movs r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area; sp -> pushed registers
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,408 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/06 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers in the prologue. */
#define STACKSIZE 256
/* Incoming argument registers. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Destination cursor: first word above the seven pushed registers. The
   routine updates this slot in place as it advances. */
#define B [fp, #4 ]
/* Source cursor, kept in the scratch area because r2 is reused as M4. */
#define A [fp, #-248 ]
#define M r0
#define N r1
/* M4 = M * 32: byte step applied to BO1 after each four-wide copy. */
#define M4 r2
/* Source stride in bytes. */
#define LDA r5
/* AO1: read cursor. BO1/BO2/BO3: write cursors for the four-wide, two-wide
   and one-wide parts of the packed buffer. */
#define AO1 r6
#define BO1 r7
#define BO2 r8
#define BO3 r9
#define I r4
#define J r12
/* Prefetch distance in bytes. */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* COPY4x4: read four consecutive doubles at AO1, AO1+LDA, AO1+2*LDA and
   AO1+3*LDA (16 values, d0-d15) and store them contiguously at BO1.
   Then AO1 += 32 bytes and BO1 += M4. Uses r3 as a scratch row pointer. */
.macro COPY4x4
pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }
add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }
add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d8 - d11 }
add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d12 - d15 }
fstmiad BO1, { d0 - d15 }
add AO1, AO1, #32
add BO1, BO1, M4
.endm
/* COPY2x4: read two consecutive doubles at AO1 and at each of the next three
   LDA offsets (8 values, d0-d7) and store them contiguously at BO2.
   Then AO1 += 16 bytes and BO2 += 64 bytes. Uses r3 as scratch. */
.macro COPY2x4
fldmiad AO1, { d0 - d1 }
add r3, AO1, LDA
fldmiad r3, { d2 - d3 }
add r3, r3, LDA
fldmiad r3, { d4 - d5 }
add r3, r3, LDA
fldmiad r3, { d6 - d7 }
fstmiad BO2, { d0 - d7 }
add AO1, AO1, #16
add BO2, BO2, #64
.endm
/* COPY1x4: read one double at AO1 and at each of the next three LDA offsets
   (d0-d3) and store them contiguously at BO3.
   Then AO1 += 8 bytes and BO3 += 32 bytes. Uses r3 as scratch. */
.macro COPY1x4
fldmiad AO1, { d0 }
add r3, AO1, LDA
fldmiad r3, { d1 }
add r3, r3, LDA
fldmiad r3, { d2 }
add r3, r3, LDA
fldmiad r3, { d3 }
fstmiad BO3, { d0 - d3 }
add AO1, AO1, #8
add BO3, BO3, #32
.endm
/*************************************************************************************************************************/
/* COPY4x2: read four consecutive doubles at AO1 and at AO1+LDA (d0-d7) and
   store them contiguously at BO1. Then AO1 += 32 bytes and BO1 += M4.
   Uses r3 as scratch. */
.macro COPY4x2
pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }
add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }
fstmiad BO1, { d0 - d7 }
add AO1, AO1, #32
add BO1, BO1, M4
.endm
/* COPY2x2: read two consecutive doubles at AO1 and at AO1+LDA (d0-d3) and
   store them contiguously at BO2. Then AO1 += 16 bytes and BO2 += 32 bytes.
   Uses r3 as scratch. */
.macro COPY2x2
fldmiad AO1, { d0 - d1 }
add r3, AO1, LDA
fldmiad r3, { d2 - d3 }
fstmiad BO2, { d0 - d3 }
add AO1, AO1, #16
add BO2, BO2, #32
.endm
/* COPY1x2: read one double at AO1 and one at AO1+LDA (d0-d1) and store them
   contiguously at BO3. Then AO1 += 8 bytes and BO3 += 16 bytes.
   Uses r3 as scratch. */
.macro COPY1x2
fldmiad AO1, { d0 }
add r3, AO1, LDA
fldmiad r3, { d1 }
fstmiad BO3, { d0 - d1 }
add AO1, AO1, #8
add BO3, BO3, #16
.endm
/*************************************************************************************************************************/
/* COPY4x1: copy four consecutive doubles from AO1 to BO1.
   Then AO1 += 32 bytes and BO1 += M4. */
.macro COPY4x1
pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }
fstmiad BO1, { d0 - d3 }
add AO1, AO1, #32
add BO1, BO1, M4
.endm
/* COPY2x1: copy two consecutive doubles from AO1 to BO2; both pointers
   advance by 16 bytes. */
.macro COPY2x1
fldmiad AO1, { d0 - d1 }
fstmiad BO2, { d0 - d1 }
add AO1, AO1, #16
add BO2, BO2, #16
.endm
/* COPY1x1: copy one double from AO1 to BO3; both pointers advance by 8 bytes. */
.macro COPY1x1
fldmiad AO1, { d0 }
fstmiad BO3, { d0 }
add AO1, AO1, #8
add BO3, BO3, #8
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Packs doubles from A into buffer B in three regions:
 *   BO1 (starting at B)                  - groups of four consecutive elements,
 *   BO2 (B + (N & -4) * M * 8 bytes)     - the two-element remainder (bit 1 of N),
 *   BO3 (B + (N & -2) * M * 8 bytes)     - the one-element remainder (bit 0 of N).
 * Outer structure: M / 4 passes over four LDA-strided rows, then a two-row
 * pass when bit 1 of M is set, then a one-row pass when bit 0 of M is set.
 * The A and B stack slots are used as running cursors between passes.
 * d8-d15 are saved on entry and restored on exit because COPY4x4 overwrites them.
 * Returns 0 in r0.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> highest pushed word, so [fp, #4] is the first stack argument
sub sp, sp, #STACKSIZE // reserve stack
str OLD_A, A // store A (r2 is reused as M4 below)
lsl LDA, OLD_LDA, #3 // lda = lda * SIZE
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
lsl r4 , M, #3 // M * SIZE
ldr r3, B
and BO2 , N , #-4 // N rounded down to a multiple of 4
and BO3 , N , #-2 // N rounded down to a multiple of 2
mul BO2, BO2, r4
mul BO3, BO3, r4
add BO2 , BO2, r3 // BO2 = B + (N & -4) * M * SIZE
add BO3 , BO3, r3 // BO3 = B + (N & -2) * M * SIZE
lsl M4, M, #5 // M4 = M * 4 * SIZE
dgemm_tcopy_L4_BEGIN:
asrs J, M, #2 // J = M / 4
ble dgemm_tcopy_L2_BEGIN
dgemm_tcopy_L4_M4_BEGIN:
ldr AO1, A // AO1 = A
lsl r3, LDA, #2 // r3 = 4 * LDA
add r3, r3 , AO1 // A = A + 4 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #128 // B = B + 16 * SIZE
str r3, B
asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L4_M4_40
dgemm_tcopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40:
// NOTE(review): tst and asrs do not update V, while ble tests Z or (N != V);
// the ble branches after them therefore rely on V being clear - confirm, or consider beq.
tst N , #2
ble dgemm_tcopy_L4_M4_60
COPY2x4
dgemm_tcopy_L4_M4_60:
tst N, #1
ble dgemm_tcopy_L4_M4_END
COPY1x4
dgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M4_BEGIN:
ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #64 // B = B + 8 * SIZE
str r3, B
asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L2_M4_40
dgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40:
tst N , #2
ble dgemm_tcopy_L2_M4_60
COPY2x2
dgemm_tcopy_L2_M4_60:
tst N , #1
ble dgemm_tcopy_L2_M4_END
COPY1x2
dgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
dgemm_tcopy_L1_M4_BEGIN:
ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #32 // B = B + 4 * SIZE
str r3, B
asrs I, N, #2 // I = N / 4
ble dgemm_tcopy_L1_M4_40
dgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40:
tst N , #2
ble dgemm_tcopy_L1_M4_60
COPY2x1
dgemm_tcopy_L1_M4_60:
tst N , #1
ble dgemm_tcopy_L1_M4_END
COPY1x1
dgemm_tcopy_L1_M4_END:
dgemm_tcopy_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area; sp -> pushed registers
pop {r4 - r9, fp}
bx lr
EPILOGUE

64
kernel/arm/dot.c Normal file
View File

@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Dot product of two strided vectors: sum over i of x[i*inc_x] * y[i*inc_y].
 *
 *   n            number of elements; n <= 0 yields 0
 *   x, inc_x     first vector and its element stride
 *   y, inc_y     second vector and its element stride
 *
 * The sum is always accumulated in double. When built with DSDOT the return
 * type is double and each product is also formed in double, so that the
 * product of two single-precision inputs is not rounded to float before it
 * is added. Otherwise the product is formed in FLOAT and the result is
 * converted to FLOAT on return.
 */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
	BLASLONG i=0;
	BLASLONG ix=0,iy=0;
	double dot = 0.0 ;

	if ( n <= 0 ) return(dot);

	while(i < n)
	{
#if defined(DSDOT)
		/* promote before multiplying: the product must not be rounded to FLOAT */
		dot += (double)y[iy] * (double)x[ix] ;
#else
		dot += y[iy] * x[ix] ;
#endif
		ix += inc_x ;
		iy += inc_y ;
		i++ ;
	}
	return(dot);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

67
kernel/arm/gemv_n.c Normal file
View File

@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Reference matrix-vector update: y += alpha * A * x.
 *
 *   m, n         number of rows and columns of A that are used
 *   dummy1       unused
 *   alpha        scale factor applied to every x element
 *   a, lda       matrix data; element (i, j) is read as a[i + j*lda]
 *   x, inc_x     input vector (n elements) and its stride
 *   y, inc_y     vector updated in place (m elements) and its stride
 *   buffer       unused
 *
 * Returns 0. The original fell off the end of this non-void function, so a
 * caller reading the result got an indeterminate value.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix,iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp;

	ix = 0;
	a_ptr = a;

	/* walk A one lda-strided slice at a time; each slice contributes alpha*x[j] times itself to y */
	for (j=0; j<n; j++)
	{
		temp = alpha * x[ix];
		iy = 0;
		for (i=0; i<m; i++)
		{
			y[iy] += temp * a_ptr[i];
			iy += inc_y;
		}
		a_ptr += lda;
		ix += inc_x;
	}
	return(0);
}

740
kernel/arm/gemv_n_vfp.S Normal file
View File

@ -0,0 +1,740 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of scratch stack reserved below the pushed registers in the prologue. */
#define STACKSIZE 256
/* Stack-passed arguments. The prologue pushes seven registers and sets
   fp = sp + 28, so [fp, #0] is the first word above the pushed registers. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Register-passed arguments as they arrive. */
#define OLD_A r3
#define OLD_M r0
/* AO1 reuses r0 once OLD_M has been stored to the M slot. */
#define AO1 r0
#define N r1
#define J r2
/* AO2 is only ever used as a prefetch address. */
#define AO2 r4
#define XO r5
#define YO r6
/* LDA, INC_X (strided path) and INC_Y (strided path) are held in bytes. */
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
/* Spill slots inside the scratch area: M, and A used as a running cursor. */
#define M [fp, #-252 ]
#define A [fp, #-256 ]
/* Prefetch offsets in bytes. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(DOUBLE)
/* INIT_F8 (double): prefetch the destination at YO and clear the eight
   accumulators d8-d15.
   The zero is built from a core register rather than by subtracting d8 from
   itself: that subtraction yields NaN whenever d8 holds NaN or an infinity on
   entry, which would poison every accumulator.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_F8
pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]
mov r3 , #0
vmov d8 , r3 , r3 // d8 = +0.0 (all bits clear)
vmov.f64 d9 , d8
vmov.f64 d10 , d8
vmov.f64 d11 , d8
vmov.f64 d12 , d8
vmov.f64 d13 , d8
vmov.f64 d14 , d8
vmov.f64 d15 , d8
.endm
/* KERNEL_F8X8 (double): eight KERNEL_F8X1 steps, with a prefetch of the x
   vector before each group of four. */
.macro KERNEL_F8X8
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
.endm
/* KERNEL_F8X1 (double): one step of the unit-stride kernel.
   Loads one x element into d2 (XO post-incremented) and eight consecutive
   doubles starting at AO1 (in two groups of four through d4-d7), then
   accumulates d8..d15 += d2 * those eight values.
   AO1 and AO2 advance by LDA; AO2 is used only for prefetching.
   r3 is scratch. Integer and VFP instructions are interleaved on purpose. */
.macro KERNEL_F8X1
pld [ AO2 , #A_PRE ]
fldmiad XO! , { d2 }
fldmiad AO1 , { d4 - d7 }
vmla.f64 d8 , d2 , d4
pld [ AO2 , #4*SIZE ]
vmla.f64 d9 , d2 , d5
add r3, AO1, #4*SIZE
vmla.f64 d10 , d2 , d6
vmla.f64 d11 , d2 , d7
fldmiad r3 , { d4 - d7 }
vmla.f64 d12 , d2 , d4
vmla.f64 d13 , d2 , d5
add AO1, AO1, LDA
vmla.f64 d14 , d2 , d6
add AO2, AO2, LDA
vmla.f64 d15 , d2 , d7
.endm
/* SAVE_F8 (double): y[0..7] += d0 * (d8..d15), four elements at a time.
   YO advances by eight doubles. d0 is the scale factor (presumably alpha as
   passed by the caller - confirm against the calling convention in use). */
.macro SAVE_F8
fldmiad YO, { d4 - d7 }
vmla.f64 d4 , d0, d8
vmla.f64 d5 , d0, d9
vmla.f64 d6 , d0, d10
vmla.f64 d7 , d0, d11
fstmiad YO!, { d4 - d7 }
fldmiad YO, { d4 - d7 }
vmla.f64 d4 , d0, d12
vmla.f64 d5 , d0, d13
vmla.f64 d6 , d0, d14
vmla.f64 d7 , d0, d15
fstmiad YO!, { d4 - d7 }
.endm
/* INIT_F1 (double): clear the single accumulator d12.
   The zero is built from a core register rather than by subtracting d12 from
   itself, which yields NaN whenever d12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_F1
mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0 (all bits clear)
.endm
/* KERNEL_F1X1 (double): d12 += x * a for one element pair.
   XO is post-incremented by one double; AO1 advances by LDA. */
.macro KERNEL_F1X1
fldmiad XO! , { d2 }
fldmiad AO1 , { d8 }
vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA
.endm
/* SAVE_F1 (double): y[0] += d0 * d12; YO advances by one double. */
.macro SAVE_F1
fldmiad YO, { d4 }
vmla.f64 d4, d0, d12
fstmiad YO!, { d4 }
.endm
/*********************************************************************************************/
/* INIT_S4 (double): clear the four accumulators d12-d15 of the strided path.
   The zero is built from a core register rather than by subtracting d12 from
   itself, which yields NaN whenever d12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_S4
mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0 (all bits clear)
vmov.f64 d13 , d12
vmov.f64 d14 , d12
vmov.f64 d15 , d12
.endm
/* KERNEL_S4X4 (double): four KERNEL_S4X1 steps. */
.macro KERNEL_S4X4
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
.endm
/* KERNEL_S4X1 (double): one step of the strided kernel.
   Loads one x element into d2 and four consecutive doubles at AO1 into
   d8-d11, then accumulates d12..d15 += d2 * those four values.
   AO1 and AO2 advance by LDA, XO by INC_X (bytes). AO2 is prefetch only. */
.macro KERNEL_S4X1
pld [ AO2 , #A_PRE ]
fldmiad XO , { d2 }
fldmiad AO1 , { d8 - d11 }
vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA
vmla.f64 d13 , d2 , d9
add AO2, AO2, LDA
vmla.f64 d14 , d2 , d10
vmla.f64 d15 , d2 , d11
add XO, XO , INC_X
.endm
/* SAVE_S4 (double): four strided updates y += d0 * acc, using d12, d13, d14
   and d15 in turn. YO advances by INC_Y (bytes) after each element. */
.macro SAVE_S4
fldmiad YO, { d4 }
vmla.f64 d4 , d0, d12
fstmiad YO, { d4 }
add YO, YO, INC_Y
fldmiad YO, { d5 }
vmla.f64 d5 , d0, d13
fstmiad YO, { d5 }
add YO, YO, INC_Y
fldmiad YO, { d4 }
vmla.f64 d4 , d0, d14
fstmiad YO, { d4 }
add YO, YO, INC_Y
fldmiad YO, { d5 }
vmla.f64 d5 , d0, d15
fstmiad YO, { d5 }
add YO, YO, INC_Y
.endm
/* INIT_S1 (double): clear the single accumulator d12 of the strided path.
   The zero is built from a core register rather than by subtracting d12 from
   itself, which yields NaN whenever d12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_S1
mov r3 , #0
vmov d12 , r3 , r3 // d12 = +0.0 (all bits clear)
.endm
/* KERNEL_S1X1 (double): d12 += x * a for one element pair.
   AO1 advances by LDA, XO by INC_X (bytes). */
.macro KERNEL_S1X1
fldmiad XO , { d2 }
fldmiad AO1 , { d8 }
vmla.f64 d12 , d2 , d8
add AO1, AO1, LDA
add XO, XO , INC_X
.endm
/* SAVE_S1 (double): y[0] += d0 * d12; YO advances by INC_Y (bytes). */
.macro SAVE_S1
fldmiad YO, { d4 }
vmla.f64 d4, d0, d12
fstmiad YO , { d4 }
add YO, YO, INC_Y
.endm
#else /************************* SINGLE PRECISION *****************************************/
/* INIT_F8 (single): prefetch the destination at YO and clear the eight
   accumulators s8-s15.
   The zero is built from a core register rather than by subtracting s8 from
   itself: that subtraction yields NaN whenever s8 holds NaN or an infinity on
   entry, which would poison every accumulator.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_F8
pld [ YO , #Y_PRE ]
mov r3 , #0
vmov s8 , r3 // s8 = +0.0 (all bits clear)
vmov.f32 s9 , s8
vmov.f32 s10 , s8
vmov.f32 s11 , s8
vmov.f32 s12 , s8
vmov.f32 s13 , s8
vmov.f32 s14 , s8
vmov.f32 s15 , s8
.endm
/* KERNEL_F8X8 (single): one prefetch of the x vector followed by eight
   KERNEL_F8X1 steps. */
.macro KERNEL_F8X8
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
.endm
/* KERNEL_F8X1 (single): one step of the unit-stride kernel.
   Loads one x element into s2 (XO post-incremented) and eight consecutive
   floats starting at AO1 (in two groups of four through s4-s7), then
   accumulates s8..s15 += s2 * those eight values.
   AO1 and AO2 advance by LDA; AO2 is used only for prefetching. r3 is scratch. */
.macro KERNEL_F8X1
pld [ AO2, #A_PRE ]
fldmias XO! , { s2 }
fldmias AO1 , { s4 - s7 }
vmla.f32 s8 , s2 , s4
vmla.f32 s9 , s2 , s5
vmla.f32 s10 , s2 , s6
vmla.f32 s11 , s2 , s7
add r3, AO1, #4*SIZE
fldmias r3 , { s4 - s7 }
vmla.f32 s12 , s2 , s4
vmla.f32 s13 , s2 , s5
vmla.f32 s14 , s2 , s6
vmla.f32 s15 , s2 , s7
add AO1, AO1, LDA
add AO2, AO2, LDA
.endm
/* SAVE_F8 (single): y[0..7] += s0 * (s8..s15), four elements at a time.
   YO advances by eight floats. s0 is the scale factor (presumably alpha as
   passed by the caller - confirm against the calling convention in use). */
.macro SAVE_F8
fldmias YO, { s4 - s7 }
vmla.f32 s4 , s0, s8
vmla.f32 s5 , s0, s9
vmla.f32 s6 , s0, s10
vmla.f32 s7 , s0, s11
fstmias YO!, { s4 - s7 }
fldmias YO, { s4 - s7 }
vmla.f32 s4 , s0, s12
vmla.f32 s5 , s0, s13
vmla.f32 s6 , s0, s14
vmla.f32 s7 , s0, s15
fstmias YO!, { s4 - s7 }
.endm
/* INIT_F1 (single): clear the single accumulator s12.
   The zero is built from a core register rather than by subtracting s12 from
   itself, which yields NaN whenever s12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_F1
mov r3 , #0
vmov s12 , r3 // s12 = +0.0 (all bits clear)
.endm
/* KERNEL_F1X1 (single): s12 += x * a for one element pair.
   XO is post-incremented by one float; AO1 advances by LDA. */
.macro KERNEL_F1X1
fldmias XO! , { s2 }
fldmias AO1 , { s8 }
vmla.f32 s12 , s2 , s8
add AO1, AO1, LDA
.endm
/* SAVE_F1 (single): y[0] += s0 * s12; YO advances by one float. */
.macro SAVE_F1
fldmias YO, { s4 }
vmla.f32 s4, s0, s12
fstmias YO!, { s4 }
.endm
/*********************************************************************************************/
/* INIT_S4 (single): clear the four accumulators s12-s15 of the strided path.
   The zero is built from a core register rather than by subtracting s12 from
   itself, which yields NaN whenever s12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_S4
mov r3 , #0
vmov s12 , r3 // s12 = +0.0 (all bits clear)
vmov.f32 s13 , s12
vmov.f32 s14 , s12
vmov.f32 s15 , s12
.endm
/* KERNEL_S4X4 (single): four KERNEL_S4X1 steps, with a prefetch through AO2
   before each pair. */
.macro KERNEL_S4X4
pld [ AO2 , #A_PRE ]
KERNEL_S4X1
KERNEL_S4X1
pld [ AO2 , #A_PRE ]
KERNEL_S4X1
KERNEL_S4X1
.endm
/* KERNEL_S4X1 (single): one step of the strided kernel.
   Loads one x element into s2 and four consecutive floats at AO1 into
   s8-s11, then accumulates s12..s15 += s2 * those four values.
   AO1 and AO2 advance by LDA, XO by INC_X (bytes). */
.macro KERNEL_S4X1
fldmias XO , { s2 }
fldmias AO1 , { s8 - s11 }
vmla.f32 s12 , s2 , s8
vmla.f32 s13 , s2 , s9
vmla.f32 s14 , s2 , s10
vmla.f32 s15 , s2 , s11
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO , INC_X
.endm
/* SAVE_S4 (single): four strided updates y += s0 * acc, using s12, s13, s14
   and s15 in turn. YO advances by INC_Y (bytes) after each element. */
.macro SAVE_S4
fldmias YO, { s4 }
vmla.f32 s4 , s0, s12
fstmias YO, { s4 }
add YO, YO, INC_Y
fldmias YO, { s5 }
vmla.f32 s5 , s0, s13
fstmias YO, { s5 }
add YO, YO, INC_Y
fldmias YO, { s4 }
vmla.f32 s4 , s0, s14
fstmias YO, { s4 }
add YO, YO, INC_Y
fldmias YO, { s5 }
vmla.f32 s5 , s0, s15
fstmias YO, { s5 }
add YO, YO, INC_Y
.endm
/* INIT_S1 (single): clear the single accumulator s12 of the strided path.
   The zero is built from a core register rather than by subtracting s12 from
   itself, which yields NaN whenever s12 holds NaN or an infinity on entry.
   Clobbers r3, which is dead scratch at every call site of this macro. */
.macro INIT_S1
mov r3 , #0
vmov s12 , r3 // s12 = +0.0 (all bits clear)
.endm
/* KERNEL_S1X1 (single): s12 += x * a for one element pair.
   AO1 advances by LDA, XO by INC_X (bytes). */
.macro KERNEL_S1X1
fldmias XO , { s2 }
fldmias AO1 , { s8 }
vmla.f32 s12 , s2 , s8
add AO1, AO1, LDA
add XO, XO , INC_X
.endm
/* SAVE_S1 (single): y[0] += s0 * s12; YO advances by INC_Y (bytes). */
.macro SAVE_S1
fldmias YO, { s4 }
vmla.f32 s4, s0, s12
fstmias YO , { s4 }
add YO, YO, INC_Y
.endm
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Matrix-vector update y += scale * A * x, where scale is d0 (s0 in the
 * single-precision build).
 * Returns immediately (r0 = 0) when M <= 0, N <= 0, INC_X == 0 or INC_Y == 0.
 * Two paths:
 *   F (INC_X == 1 and INC_Y == 1): eight y elements per outer pass, then the
 *     M % 8 remainder one element at a time. The labels say F4 but the block
 *     size is eight.
 *   S (any other stride): four y elements per outer pass, then the M % 4
 *     remainder one element at a time.
 * In both paths the A stack slot is a running cursor that advances by the
 * block size after each outer pass, and XO restarts from X for every pass.
 */
PROLOGUE
.align 5
push {r4 - r9 , fp}
add fp, sp, #28 // fp -> first word above the pushed registers (first stack argument)
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
cmp OLD_M, #0
ble gemvn_kernel_L999
cmp N, #0
ble gemvn_kernel_L999
str OLD_A, A // r3 becomes scratch from here on
str OLD_M, M // r0 becomes AO1 from here on
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
cmp INC_X, #0
beq gemvn_kernel_L999
cmp INC_Y, #0
beq gemvn_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif
// unit strides on both vectors select the F path, anything else the S path
cmp INC_X, #1
bne gemvn_kernel_S4_BEGIN
cmp INC_Y, #1
bne gemvn_kernel_S4_BEGIN
gemvn_kernel_F4_BEGIN:
ldr YO , Y
ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_F1_BEGIN
gemvn_kernel_F4X4:
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #8*SIZE
str r3 , A // A cursor moves on by eight elements
add AO2, AO2, LDA
add AO2, AO2, LDA // AO2 = AO1 + 3 * LDA, prefetch address only
ldr XO , X
INIT_F8
asrs J, N, #3 // J = N / 8
ble gemvn_kernel_F4X1
gemvn_kernel_F4X4_10:
KERNEL_F8X8
subs J, J, #1
bne gemvn_kernel_F4X4_10
gemvn_kernel_F4X1:
ands J, N , #7 // J = N % 8
ble gemvn_kernel_F4_END
gemvn_kernel_F4X1_10:
KERNEL_F8X1
subs J, J, #1
bne gemvn_kernel_F4X1_10
gemvn_kernel_F4_END:
SAVE_F8
subs I , I , #1
bne gemvn_kernel_F4X4
gemvn_kernel_F1_BEGIN:
ldr I, M
ands I, I , #7 // I = M % 8
ble gemvn_kernel_L999
gemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A // A cursor moves on by one element
ldr XO , X
INIT_F1
mov J, N // N > 0 was checked on entry, so the loop below runs at least once
gemvn_kernel_F1X1_10:
KERNEL_F1X1
subs J, J, #1
bne gemvn_kernel_F1X1_10
gemvn_kernel_F1_END:
SAVE_F1
subs I , I , #1
bne gemvn_kernel_F1X1
b gemvn_kernel_L999
/*************************************************************************************************************/
gemvn_kernel_S4_BEGIN:
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
ldr YO , Y
ldr I, M
asrs I, I, #2 // I = M / 4
ble gemvn_kernel_S1_BEGIN
gemvn_kernel_S4X4:
ldr AO1, A
add AO2, AO1, LDA // prefetch address only
add r3 , AO1, #4*SIZE
str r3 , A // A cursor moves on by four elements
ldr XO , X
INIT_S4
asrs J, N, #2 // J = N / 4
ble gemvn_kernel_S4X1
gemvn_kernel_S4X4_10:
KERNEL_S4X4
subs J, J, #1
bne gemvn_kernel_S4X4_10
gemvn_kernel_S4X1:
ands J, N , #3 // J = N % 4
ble gemvn_kernel_S4_END
gemvn_kernel_S4X1_10:
KERNEL_S4X1
subs J, J, #1
bne gemvn_kernel_S4X1_10
gemvn_kernel_S4_END:
SAVE_S4
subs I , I , #1
bne gemvn_kernel_S4X4
gemvn_kernel_S1_BEGIN:
ldr I, M
ands I, I , #3 // I = M % 4
ble gemvn_kernel_L999
gemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #SIZE
str r3, A // A cursor moves on by one element
ldr XO , X
INIT_S1
mov J, N // N > 0 was checked on entry, so the loop below runs at least once
gemvn_kernel_S1X1_10:
KERNEL_S1X1
subs J, J, #1
bne gemvn_kernel_S1X1_10
gemvn_kernel_S1_END:
SAVE_S1
subs I , I , #1
bne gemvn_kernel_S1X1
/*************************************************************************************************************/
gemvn_kernel_L999:
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28 // drop the scratch area; sp -> pushed registers
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

781
kernel/arm/gemv_n_vfpv3.S Normal file
View File

@ -0,0 +1,781 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/19 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of local stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256
/* Arguments read from the caller stack. The prologue sets fp to the value of
 * sp at entry, so these offsets are relative to the entry stack pointer. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Values read from core registers at entry. OLD_M shares r0 with AO1; the
 * prologue stores OLD_M to the M stack slot before AO1 is first loaded. */
#define OLD_A r3
#define OLD_M r0
/* Working registers. */
#define AO1 r0
#define N r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
/* Local stack slots: M as passed in, and the A pointer for the next pass. */
#define M [fp, #-252 ]
#define A [fp, #-256 ]
/* Byte offsets added to XO / YO / AO2 by the pld prefetches in the macros. */
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(DOUBLE)
// INIT_F8 (double, unit stride): prefetch the Y block at YO and clear the
// eight accumulators d24-d31.
// The accumulator is first loaded with a finite constant: subtracting a
// register from itself yields NaN, not 0, when the register holds NaN or Inf
// (stale caller data or an overflowed accumulator from the previous pass).
.macro INIT_F8
pld [ YO , #Y_PRE ]
pld [ YO , #Y_PRE+32 ]
vmov.f64 d24 , #1.0 // any finite value will do
vsub.f64 d24 , d24 , d24 // d24 = 0.0
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
vmov.f64 d28 , d24
vmov.f64 d29 , d24
vmov.f64 d30 , d24
vmov.f64 d31 , d24
.endm
// KERNEL_F8X8 (double, unit stride): eight KERNEL_F8X1 steps, with a prefetch
// at XO + X_PRE issued before each group of four steps.
.macro KERNEL_F8X8
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
.endm
// KERNEL_F8X1 (double, unit stride): one step of the 8-wide update.
// Loads one X element and 8 consecutive A elements from AO1, then does
// d24..d31 += x * a[0..7]. AO1 and AO2 each advance by LDA bytes; AO2 is
// used only as a prefetch address. Pointer updates and prefetches are
// interleaved with the multiply-accumulates.
.macro KERNEL_F8X1
fldmiad XO! , { d4 } // d4 = *XO, XO += 8 bytes
fldmiad AO1 , { d8 - d15 } // 8 elements of A, AO1 is not written back
vmla.f64 d24 , d4 , d8
pld [ AO2 , #A_PRE ]
vmla.f64 d25 , d4 , d9
pld [ AO2 , #A_PRE+32 ]
vmla.f64 d26 , d4 , d10
vmla.f64 d27 , d4 , d11
vmla.f64 d28 , d4 , d12
vmla.f64 d29 , d4 , d13
add AO1, AO1, LDA
vmla.f64 d30 , d4 , d14
add AO2, AO2, LDA
vmla.f64 d31 , d4 , d15
.endm
// SAVE_F8 (double, unit stride): y[0..7] += d0 * (d24..d31), then YO moves
// past the 8 stored elements. d0 is never written in this file, so it still
// holds its value from function entry (presumably alpha - confirm against
// the caller).
.macro SAVE_F8
fldmiad YO, { d16 - d23 } // load 8 elements of Y, no writeback
vmla.f64 d16, d0, d24
vmla.f64 d17, d0, d25
vmla.f64 d18, d0, d26
vmla.f64 d19, d0, d27
vmla.f64 d20, d0, d28
vmla.f64 d21, d0, d29
vmla.f64 d22, d0, d30
vmla.f64 d23, d0, d31
fstmiad YO!, { d16 - d23 } // store and advance YO by 64 bytes
.endm
// INIT_F1 (double, unit stride): clear the single accumulator d24.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_F1
vmov.f64 d24 , #1.0 // any finite value will do
vsub.f64 d24 , d24 , d24 // d24 = 0.0
.endm
// KERNEL_F1X1 (double, unit stride): d24 += (*XO) * (*AO1);
// XO advances by one element, AO1 by LDA bytes.
.macro KERNEL_F1X1
fldmiad XO! , { d4 }
fldmiad AO1 , { d8 }
vmla.f64 d24 , d4 , d8
add AO1, AO1, LDA
.endm
// SAVE_F1 (double, unit stride): *YO += d0 * d24, then YO advances by one
// element. d0 is never written in this file (presumably alpha - confirm).
.macro SAVE_F1
fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO!, { d16 }
.endm
/*********************************************************************************************/
// INIT_S8 (double, strided): clear the eight accumulators d24-d31.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_S8
vmov.f64 d24 , #1.0 // any finite value will do
vsub.f64 d24 , d24 , d24 // d24 = 0.0
vmov.f64 d25 , d24
vmov.f64 d26 , d24
vmov.f64 d27 , d24
vmov.f64 d28 , d24
vmov.f64 d29 , d24
vmov.f64 d30 , d24
vmov.f64 d31 , d24
.endm
// KERNEL_S8X8 (double, strided): eight consecutive KERNEL_S8X1 steps.
.macro KERNEL_S8X8
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
.endm
// KERNEL_S8X1 (double, strided): one step of the 8-wide update.
// d24..d31 += (*XO) * a[0..7] with a[] read from AO1. Afterwards AO1 and
// AO2 advance by LDA bytes and XO by INC_X (already scaled to bytes by the
// main routine). AO2 is used only as a prefetch address.
.macro KERNEL_S8X1
pld [ AO2 , #A_PRE ]
pld [ AO2 , #A_PRE+32 ]
fldmiad XO , { d4 } // no writeback: XO is stepped by INC_X below
fldmiad AO1 , { d8 - d15 }
vmla.f64 d24 , d4 , d8
vmla.f64 d25 , d4 , d9
vmla.f64 d26 , d4 , d10
vmla.f64 d27 , d4 , d11
vmla.f64 d28 , d4 , d12
vmla.f64 d29 , d4 , d13
vmla.f64 d30 , d4 , d14
vmla.f64 d31 , d4 , d15
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO, INC_X
.endm
// SAVE_S8 (double, strided): for k = 0..7, *YO += d0 * d(24+k) followed by
// YO += INC_Y (bytes). d0 is never written in this file (presumably alpha -
// confirm against the caller).
.macro SAVE_S8
fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO, { d16 }
add YO, YO, INC_Y
fldmiad YO, { d17 }
vmla.f64 d17, d0, d25
fstmiad YO, { d17 }
add YO, YO, INC_Y
fldmiad YO, { d18 }
vmla.f64 d18, d0, d26
fstmiad YO, { d18 }
add YO, YO, INC_Y
fldmiad YO, { d19 }
vmla.f64 d19, d0, d27
fstmiad YO, { d19 }
add YO, YO, INC_Y
fldmiad YO, { d20 }
vmla.f64 d20, d0, d28
fstmiad YO, { d20 }
add YO, YO, INC_Y
fldmiad YO, { d21 }
vmla.f64 d21, d0, d29
fstmiad YO, { d21 }
add YO, YO, INC_Y
fldmiad YO, { d22 }
vmla.f64 d22, d0, d30
fstmiad YO, { d22 }
add YO, YO, INC_Y
fldmiad YO, { d23 }
vmla.f64 d23, d0, d31
fstmiad YO, { d23 }
add YO, YO, INC_Y
.endm
// INIT_S1 (double, strided): clear the single accumulator d24.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_S1
vmov.f64 d24 , #1.0 // any finite value will do
vsub.f64 d24 , d24 , d24 // d24 = 0.0
.endm
// KERNEL_S1X1 (double, strided): d24 += (*XO) * (*AO1);
// AO1 advances by LDA bytes, XO by INC_X bytes.
.macro KERNEL_S1X1
fldmiad XO , { d4 }
fldmiad AO1 , { d8 }
vmla.f64 d24 , d4 , d8
add AO1, AO1, LDA
add XO, XO, INC_X
.endm
// SAVE_S1 (double, strided): *YO += d0 * d24, then YO += INC_Y (bytes).
// d0 is never written in this file (presumably alpha - confirm).
.macro SAVE_S1
fldmiad YO, { d16 }
vmla.f64 d16, d0, d24
fstmiad YO, { d16 }
add YO, YO, INC_Y
.endm
#else /************************* SINGLE PRECISION *****************************************/
// INIT_F8 (single, unit stride): prefetch the Y block at YO and clear the
// eight accumulators s24-s31.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf (stale caller
// data or an overflowed accumulator from the previous pass).
.macro INIT_F8
pld [ YO , #Y_PRE ]
vmov.f32 s24 , #1.0 // any finite value will do
vsub.f32 s24 , s24 , s24 // s24 = 0.0
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
vmov.f32 s28 , s24
vmov.f32 s29 , s24
vmov.f32 s30 , s24
vmov.f32 s31 , s24
.endm
// KERNEL_F8X8 (single, unit stride): one prefetch at XO + X_PRE followed by
// eight KERNEL_F8X1 steps.
.macro KERNEL_F8X8
pld [ XO , #X_PRE ]
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
KERNEL_F8X1
.endm
// KERNEL_F8X1 (single, unit stride): one step of the 8-wide update.
// s24..s31 += (*XO) * a[0..7] with a[] read from AO1. XO advances by one
// element, AO1 and AO2 by LDA bytes; AO2 is used only as a prefetch address.
.macro KERNEL_F8X1
pld [ AO2 , #A_PRE ]
fldmias XO! , { s4 } // s4 = *XO, XO += 4 bytes
fldmias AO1 , { s8 - s15 } // 8 elements of A, AO1 is not written back
vmla.f32 s24 , s4 , s8
vmla.f32 s25 , s4 , s9
vmla.f32 s26 , s4 , s10
vmla.f32 s27 , s4 , s11
vmla.f32 s28 , s4 , s12
vmla.f32 s29 , s4 , s13
vmla.f32 s30 , s4 , s14
vmla.f32 s31 , s4 , s15
add AO1, AO1, LDA
add AO2, AO2, LDA
.endm
// SAVE_F8 (single, unit stride): y[0..7] += s0 * (s24..s31), then YO moves
// past the 8 stored elements. s0 is never written in this file, so it still
// holds its value from function entry (presumably alpha - confirm against
// the caller).
.macro SAVE_F8
fldmias YO, { s16 - s23 } // load 8 elements of Y, no writeback
vmla.f32 s16, s0, s24
vmla.f32 s17, s0, s25
vmla.f32 s18, s0, s26
vmla.f32 s19, s0, s27
vmla.f32 s20, s0, s28
vmla.f32 s21, s0, s29
vmla.f32 s22, s0, s30
vmla.f32 s23, s0, s31
fstmias YO!, { s16 - s23 } // store and advance YO by 32 bytes
.endm
// INIT_F1 (single, unit stride): clear the single accumulator s24.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_F1
vmov.f32 s24 , #1.0 // any finite value will do
vsub.f32 s24 , s24 , s24 // s24 = 0.0
.endm
// KERNEL_F1X1 (single, unit stride): s24 += (*XO) * (*AO1);
// XO advances by one element, AO1 by LDA bytes.
.macro KERNEL_F1X1
fldmias XO! , { s4 }
fldmias AO1 , { s8 }
vmla.f32 s24 , s4 , s8
add AO1, AO1, LDA
.endm
// SAVE_F1 (single, unit stride): *YO += s0 * s24, then YO advances by one
// element. s0 is never written in this file (presumably alpha - confirm).
.macro SAVE_F1
fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO!, { s16 }
.endm
/*********************************************************************************************/
// INIT_S8 (single, strided): clear the eight accumulators s24-s31.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_S8
vmov.f32 s24 , #1.0 // any finite value will do
vsub.f32 s24 , s24 , s24 // s24 = 0.0
vmov.f32 s25 , s24
vmov.f32 s26 , s24
vmov.f32 s27 , s24
vmov.f32 s28 , s24
vmov.f32 s29 , s24
vmov.f32 s30 , s24
vmov.f32 s31 , s24
.endm
// KERNEL_S8X8 (single, strided): eight consecutive KERNEL_S8X1 steps.
.macro KERNEL_S8X8
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
KERNEL_S8X1
.endm
// KERNEL_S8X1 (single, strided): one step of the 8-wide update.
// s24..s31 += (*XO) * a[0..7] with a[] read from AO1. Afterwards AO1 and
// AO2 advance by LDA bytes and XO by INC_X (already scaled to bytes by the
// main routine). AO2 is used only as a prefetch address.
.macro KERNEL_S8X1
pld [ AO2 , #A_PRE ]
fldmias XO , { s4 } // no writeback: XO is stepped by INC_X below
fldmias AO1 , { s8 - s15 }
vmla.f32 s24 , s4 , s8
vmla.f32 s25 , s4 , s9
vmla.f32 s26 , s4 , s10
vmla.f32 s27 , s4 , s11
vmla.f32 s28 , s4 , s12
vmla.f32 s29 , s4 , s13
vmla.f32 s30 , s4 , s14
vmla.f32 s31 , s4 , s15
add AO1, AO1, LDA
add AO2, AO2, LDA
add XO, XO, INC_X
.endm
// SAVE_S8 (single, strided): for k = 0..7, *YO += s0 * s(24+k) followed by
// YO += INC_Y (bytes). s0 is never written in this file (presumably alpha -
// confirm against the caller).
.macro SAVE_S8
fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO, { s16 }
add YO, YO, INC_Y
fldmias YO, { s17 }
vmla.f32 s17, s0, s25
fstmias YO, { s17 }
add YO, YO, INC_Y
fldmias YO, { s18 }
vmla.f32 s18, s0, s26
fstmias YO, { s18 }
add YO, YO, INC_Y
fldmias YO, { s19 }
vmla.f32 s19, s0, s27
fstmias YO, { s19 }
add YO, YO, INC_Y
fldmias YO, { s20 }
vmla.f32 s20, s0, s28
fstmias YO, { s20 }
add YO, YO, INC_Y
fldmias YO, { s21 }
vmla.f32 s21, s0, s29
fstmias YO, { s21 }
add YO, YO, INC_Y
fldmias YO, { s22 }
vmla.f32 s22, s0, s30
fstmias YO, { s22 }
add YO, YO, INC_Y
fldmias YO, { s23 }
vmla.f32 s23, s0, s31
fstmias YO, { s23 }
add YO, YO, INC_Y
.endm
// INIT_S1 (single, strided): clear the single accumulator s24.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_S1
vmov.f32 s24 , #1.0 // any finite value will do
vsub.f32 s24 , s24 , s24 // s24 = 0.0
.endm
// KERNEL_S1X1 (single, strided): s24 += (*XO) * (*AO1);
// AO1 advances by LDA bytes, XO by INC_X bytes.
.macro KERNEL_S1X1
fldmias XO , { s4 }
fldmias AO1 , { s8 }
vmla.f32 s24 , s4 , s8
add AO1, AO1, LDA
add XO, XO, INC_X
.endm
// SAVE_S1 (single, strided): *YO += s0 * s24, then YO += INC_Y (bytes).
// s0 is never written in this file (presumably alpha - confirm).
.macro SAVE_S1
fldmias YO, { s16 }
vmla.f32 s16, s0, s24
fstmias YO, { s16 }
add YO, YO, INC_Y
.endm
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// gemv_n entry point (VFPv3 variant).
// Flow: save registers, return early when M <= 0, N <= 0 or an increment is
// zero, then take the F path when INC_X == 1 and INC_Y == 1, otherwise the
// S path (increments scaled to bytes). Both paths process M in groups of 8
// with a one-at-a-time remainder, and for each group walk N in steps of 8
// plus remainder. r0 is always set to 0 on return.
PROLOGUE
.align 5
push {r4 - r9 , fp} // 7 core registers = 28 bytes
add fp, sp, #28 // fp = value of sp at entry
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192 // r12 = start of the FP register save area
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s31 } // store floating point registers
#endif
cmp OLD_M, #0 // M <= 0: nothing to do
ble gemvn_kernel_L999
cmp N, #0 // N <= 0: nothing to do
ble gemvn_kernel_L999
str OLD_A, A // r3 and r0 are reused below, keep A and M on the stack
str OLD_M, M
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
cmp INC_X, #0 // a zero increment exits without touching Y
beq gemvn_kernel_L999
cmp INC_Y, #0
beq gemvn_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif
cmp INC_X, #1 // any non-unit increment selects the S path
bne gemvn_kernel_S8_BEGIN
cmp INC_Y, #1
bne gemvn_kernel_S8_BEGIN
gemvn_kernel_F8_BEGIN: // F path: INC_X == 1 and INC_Y == 1
ldr YO , Y
ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_F1_BEGIN // no full group of 8: handle the M % 8 remainder
gemvn_kernel_F8X8: // one pass per group of 8, I counts the passes down
ldr AO1, A
add AO2, AO1, LDA // AO2 runs one LDA stride ahead of AO1 (prefetch address)
add r3 , AO1, #8*SIZE // saved A pointer moves on by 8 elements for the next pass
str r3 , A
ldr XO , X // X is re-read from its start for every pass
INIT_F8
asrs J, N, #3 // J = N / 8
ble gemvn_kernel_F8X1
gemvn_kernel_F8X8_10: // J invocations of KERNEL_F8X8
KERNEL_F8X8
subs J, J, #1
bne gemvn_kernel_F8X8_10
gemvn_kernel_F8X1:
ands J, N , #7 // J = N % 8
ble gemvn_kernel_F8_END
gemvn_kernel_F8X1_10: // J invocations of KERNEL_F8X1
KERNEL_F8X1
subs J, J, #1
bne gemvn_kernel_F8X1_10
gemvn_kernel_F8_END:
SAVE_F8 // fold the 8 accumulators into Y
subs I , I , #1
bne gemvn_kernel_F8X8
gemvn_kernel_F1_BEGIN:
ldr I, M
ands I, I , #7 // I = M % 8
ble gemvn_kernel_L999
gemvn_kernel_F1X1: // one pass per leftover element, N kernel steps each
ldr AO1, A
add r3, AO1, #SIZE // saved A pointer moves on by 1 element
str r3, A
ldr XO , X
INIT_F1
mov J, N
gemvn_kernel_F1X1_10:
KERNEL_F1X1
subs J, J, #1
bne gemvn_kernel_F1X1_10
gemvn_kernel_F1_END:
SAVE_F1
subs I , I , #1
bne gemvn_kernel_F1X1
b gemvn_kernel_L999
/*************************************************************************************************************/
gemvn_kernel_S8_BEGIN: // S path: at least one increment is not 1
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
ldr YO , Y
ldr I, M
asrs I, I, #3 // I = M / 8
ble gemvn_kernel_S1_BEGIN // no full group of 8: handle the M % 8 remainder
gemvn_kernel_S8X8: // one pass per group of 8, I counts the passes down
ldr AO1, A
add AO2, AO1, LDA // AO2 runs one LDA stride ahead of AO1 (prefetch address)
add r3 , AO1, #8*SIZE // saved A pointer moves on by 8 elements for the next pass
str r3 , A
ldr XO , X
INIT_S8
asrs J, N, #3 // J = N / 8
ble gemvn_kernel_S8X1
gemvn_kernel_S8X8_10: // J invocations of KERNEL_S8X8
KERNEL_S8X8
subs J, J, #1
bne gemvn_kernel_S8X8_10
gemvn_kernel_S8X1:
ands J, N , #7 // J = N % 8
ble gemvn_kernel_S8_END
gemvn_kernel_S8X1_10: // J invocations of KERNEL_S8X1
KERNEL_S8X1
subs J, J, #1
bne gemvn_kernel_S8X1_10
gemvn_kernel_S8_END:
SAVE_S8 // fold the 8 accumulators into Y
subs I , I , #1
bne gemvn_kernel_S8X8
gemvn_kernel_S1_BEGIN:
ldr I, M
ands I, I , #7 // I = M % 8
ble gemvn_kernel_L999
gemvn_kernel_S1X1: // one pass per leftover element, N kernel steps each
ldr AO1, A
add r3, AO1, #SIZE // saved A pointer moves on by 1 element
str r3, A
ldr XO , X
INIT_S1
mov J, N
gemvn_kernel_S1X1_10:
KERNEL_S1X1
subs J, J, #1
bne gemvn_kernel_S1X1_10
gemvn_kernel_S1_END:
SAVE_S1
subs I , I , #1
bne gemvn_kernel_S1X1
/*************************************************************************************************************/
gemvn_kernel_L999: // common exit
sub r3, fp, #192 // same save area address as in the prologue
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s31 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28 // sp back to just below the pushed core registers
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

67
kernel/arm/gemv_t.c Normal file
View File

@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Reference (plain C) gemv_t kernel.
 *
 * For each j in [0, n):
 *     y[j * inc_y] += alpha * sum over i in [0, m) of a[j * lda + i] * x[i * inc_x]
 * i.e. y += alpha * A^T * x for a column-major m x n matrix A with leading
 * dimension lda.
 *
 * m, n         matrix dimensions; nothing is done when either is <= 0
 * dummy1       unused
 * alpha        scalar multiplier
 * a, lda       matrix data and the element stride between consecutive columns
 * x, inc_x     input vector and its element stride
 * y, inc_y     vector updated in place and its element stride
 * buffer       unused
 *
 * Returns 0. The function is declared int, so it must return a value; the
 * assembly kernels in this directory also set their return value to 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG ix,iy;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT temp;

	iy = 0;
	a_ptr = a;

	for (j=0; j<n; j++)
	{
		/* dot product of column j with x */
		temp = 0.0;
		ix = 0;
		for (i=0; i<m; i++)
		{
			temp += a_ptr[i] * x[ix];
			ix += inc_x;
		}
		y[iy] += alpha * temp;
		iy += inc_y;
		a_ptr += lda;
	}

	return(0);
}

750
kernel/arm/gemv_t_vfp.S Normal file
View File

@ -0,0 +1,750 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/25 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Bytes of local stack reserved by the prologue (sub sp, sp, #STACKSIZE). */
#define STACKSIZE 256
/* Arguments read from the caller stack. The prologue sets fp to the value of
 * sp at entry, so these offsets are relative to the entry stack pointer. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Values read from core registers at entry. OLD_N shares r1 with AO1; the
 * prologue stores OLD_N to the N stack slot before AO1 is first loaded. */
#define OLD_A r3
#define OLD_N r1
/* Working registers. M stays in r0 for the whole routine. */
#define M r0
#define AO1 r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
/* Local stack slots: N as passed in, and the A pointer for the next pass. */
#define N [fp, #-252 ]
#define A [fp, #-256 ]
/* Byte offsets added to XO / AO1 / AO2 by the pld prefetches in the macros. */
#define X_PRE 512
#define A_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(DOUBLE)
// INIT_F2 (double, unit stride): clear the two accumulators d2 and d3.
// Zero is moved in from a core register instead of computing d2 - d2, which
// yields NaN when the register holds NaN or Inf (stale data or an overflowed
// accumulator from the previous pass).
// Clobbers r3; r3 is dead at every use of this macro in the main routine
// (its last use there is "str r3 , A").
.macro INIT_F2
mov r3 , #0
vmov d2 , r3 , r3 // d2 = +0.0 (all bits zero)
vmov.f64 d3 , d2
.endm
// KERNEL_F2X4 (double, unit stride): four steps of two dot products.
// Loads x[0..3] from XO, a1[0..3] from AO1 and a2[0..3] from AO2 (all three
// pointers post-increment past the 4 elements), then
//   d2 += x[k] * a1[k],  d3 += x[k] * a2[k]   for k = 0..3.
.macro KERNEL_F2X4
pld [ XO , #X_PRE ]
fldmiad XO! , { d12 - d15 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d4 - d5 }
fldmiad AO1!, { d10 - d11 }
fldmiad AO2!, { d6 - d7 }
vmla.f64 d2 , d12 , d8
vmla.f64 d3 , d12 , d4
vmla.f64 d2 , d13 , d9
vmla.f64 d3 , d13 , d5
vmla.f64 d2 , d14, d10
vmla.f64 d3 , d14, d6
vmla.f64 d2 , d15, d11
vmla.f64 d3 , d15, d7
.endm
// KERNEL_F2X1 (double, unit stride): single step of two dot products:
// d2 += (*XO) * (*AO1), d3 += (*XO) * (*AO2); all three pointers advance by
// one element.
.macro KERNEL_F2X1
fldmiad XO! , { d1 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d4 }
vmla.f64 d2 , d1 , d8
vmla.f64 d3 , d1 , d4
.endm
// SAVE_F2 (double, unit stride): y[0] += d0 * d2, y[1] += d0 * d3, then YO
// advances by two elements. d0 is never written in this file, so it still
// holds its value from function entry (presumably alpha - confirm).
.macro SAVE_F2
fldmiad YO, { d4 - d5 }
vmla.f64 d4, d0, d2
vmla.f64 d5, d0, d3
fstmiad YO!, { d4 - d5 }
.endm
// INIT_F1 (double, unit stride): clear the single accumulator d2.
// Zero is moved in from a core register instead of computing d2 - d2, which
// yields NaN when the register holds NaN or Inf.
// Clobbers r3; r3 is dead at every use of this macro in the main routine.
.macro INIT_F1
mov r3 , #0
vmov d2 , r3 , r3 // d2 = +0.0 (all bits zero)
.endm
// KERNEL_F1X4 (double, unit stride): four steps of one dot product.
// d2 += x[k] * a1[k] for k = 0..3; XO and AO1 post-increment past the
// 4 elements.
.macro KERNEL_F1X4
pld [ XO , #X_PRE ]
fldmiad XO! , { d12 - d15 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
fldmiad AO1!, { d10 - d11 }
vmla.f64 d2 , d12 , d8
vmla.f64 d2 , d13 , d9
vmla.f64 d2 , d14, d10
vmla.f64 d2 , d15, d11
.endm
// KERNEL_F1X1 (double, unit stride): d2 += (*XO) * (*AO1); both pointers
// advance by one element.
.macro KERNEL_F1X1
fldmiad XO! , { d1 }
fldmiad AO1!, { d8 }
vmla.f64 d2 , d1 , d8
.endm
// SAVE_F1 (double, unit stride): *YO += d0 * d2, then YO advances by one
// element. d0 is never written in this file (presumably alpha - confirm).
.macro SAVE_F1
fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO!, { d4 }
.endm
// INIT_S2 (double, strided): clear the two accumulators d2 and d3.
// Zero is moved in from a core register instead of computing d2 - d2, which
// yields NaN when the register holds NaN or Inf (stale data or an overflowed
// accumulator from the previous pass).
// Clobbers r3; r3 is dead at every use of this macro in the main routine
// (its last use there is "str r3 , A").
.macro INIT_S2
mov r3 , #0
vmov d2 , r3 , r3 // d2 = +0.0 (all bits zero)
vmov.f64 d3 , d2
.endm
// KERNEL_S2X4 (double, strided): four steps of two dot products.
// x[0..3] are loaded one at a time with XO stepping by INC_X bytes; a1[0..3]
// and a2[0..3] come from AO1 / AO2 with post-increment. Then
//   d2 += x[k] * a1[k],  d3 += x[k] * a2[k]   for k = 0..3.
.macro KERNEL_S2X4
fldmiad XO , { d12 }
add XO, XO, INC_X
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d4 - d5 }
fldmiad XO , { d13 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }
fldmiad AO2!, { d6 - d7 }
fldmiad XO , { d14 }
add XO, XO, INC_X
fldmiad XO , { d15 }
add XO, XO, INC_X
vmla.f64 d2 , d12 , d8
vmla.f64 d3 , d12 , d4
vmla.f64 d2 , d13 , d9
vmla.f64 d3 , d13 , d5
vmla.f64 d2 , d14, d10
vmla.f64 d3 , d14, d6
vmla.f64 d2 , d15, d11
vmla.f64 d3 , d15, d7
.endm
// KERNEL_S2X1 (double, strided): single step of two dot products:
// d2 += (*XO) * (*AO1), d3 += (*XO) * (*AO2); AO1 and AO2 advance by one
// element, XO by INC_X bytes.
.macro KERNEL_S2X1
fldmiad XO , { d1 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d4 }
vmla.f64 d2 , d1 , d8
add XO, XO, INC_X
vmla.f64 d3 , d1 , d4
.endm
// SAVE_S2 (double, strided): *YO += d0 * d2, YO += INC_Y, *YO += d0 * d3,
// YO += INC_Y (INC_Y in bytes). d0 is never written in this file
// (presumably alpha - confirm).
.macro SAVE_S2
fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO, { d4 }
add YO, YO, INC_Y
fldmiad YO, { d5 }
vmla.f64 d5, d0, d3
fstmiad YO, { d5 }
add YO, YO, INC_Y
.endm
// INIT_S1 (double, strided): clear the single accumulator d2.
// Zero is moved in from a core register instead of computing d2 - d2, which
// yields NaN when the register holds NaN or Inf.
// Clobbers r3; r3 is dead at every use of this macro in the main routine.
.macro INIT_S1
mov r3 , #0
vmov d2 , r3 , r3 // d2 = +0.0 (all bits zero)
.endm
// KERNEL_S1X4 (double, strided): four steps of one dot product.
// x[0..3] are loaded one at a time with XO stepping by INC_X bytes;
// d2 += x[k] * a1[k] for k = 0..3 with a1[] read from AO1 (post-increment).
.macro KERNEL_S1X4
fldmiad XO , { d12 }
add XO, XO, INC_X
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
fldmiad XO , { d13 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }
fldmiad XO , { d14 }
add XO, XO, INC_X
fldmiad XO , { d15 }
add XO, XO, INC_X
vmla.f64 d2 , d12 , d8
vmla.f64 d2 , d13 , d9
vmla.f64 d2 , d14, d10
vmla.f64 d2 , d15, d11
.endm
// KERNEL_S1X1 (double, strided): d2 += (*XO) * (*AO1); AO1 advances by one
// element, XO by INC_X bytes.
.macro KERNEL_S1X1
fldmiad XO , { d1 }
fldmiad AO1!, { d8 }
vmla.f64 d2 , d1 , d8
add XO, XO, INC_X
.endm
// SAVE_S1 (double, strided): *YO += d0 * d2, then YO += INC_Y (bytes).
// d0 is never written in this file (presumably alpha - confirm).
.macro SAVE_S1
fldmiad YO, { d4 }
vmla.f64 d4, d0, d2
fstmiad YO, { d4 }
add YO, YO, INC_Y
.endm
#else /************************* SINGLE PRECISION *****************************************/
// INIT_F2 (single, unit stride): clear the two accumulators s2 and s3.
// Zero is moved in from a core register instead of computing s2 - s2, which
// yields NaN when the register holds NaN or Inf (stale data or an overflowed
// accumulator from the previous pass).
// Clobbers r3; r3 is dead at every use of this macro in the main routine
// (its last use there is "str r3 , A").
.macro INIT_F2
mov r3 , #0
vmov s2 , r3 // s2 = +0.0 (all bits zero)
vmov.f32 s3 , s2
.endm
// KERNEL_F2X4 (single, unit stride): four steps of two dot products.
// Loads x[0..3] from XO, a1[0..3] from AO1 and a2[0..3] from AO2 (all three
// pointers post-increment past the 4 elements), then
//   s2 += x[k] * a1[k],  s3 += x[k] * a2[k]   for k = 0..3.
.macro KERNEL_F2X4
fldmias XO! , { s12 - s15 }
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s4 - s5 }
fldmias AO1!, { s10 - s11 }
fldmias AO2!, { s6 - s7 }
vmla.f32 s2 , s12 , s8
vmla.f32 s3 , s12 , s4
vmla.f32 s2 , s13 , s9
vmla.f32 s3 , s13 , s5
vmla.f32 s2 , s14, s10
vmla.f32 s3 , s14, s6
vmla.f32 s2 , s15, s11
vmla.f32 s3 , s15, s7
.endm
// KERNEL_F2X1 (single, unit stride): single step of two dot products:
// s2 += (*XO) * (*AO1), s3 += (*XO) * (*AO2); all three pointers advance by
// one element.
.macro KERNEL_F2X1
fldmias XO! , { s1 }
fldmias AO1!, { s8 }
fldmias AO2!, { s4 }
vmla.f32 s2 , s1 , s8
vmla.f32 s3 , s1 , s4
.endm
// SAVE_F2 (single, unit stride): y[0] += s0 * s2, y[1] += s0 * s3, then YO
// advances by two elements. s0 is never written in this file, so it still
// holds its value from function entry (presumably alpha - confirm).
.macro SAVE_F2
fldmias YO, { s4 - s5 }
vmla.f32 s4, s0, s2
vmla.f32 s5, s0, s3
fstmias YO!, { s4 - s5 }
.endm
// INIT_F1 (single, unit stride): clear the single accumulator s2.
// Zero is moved in from a core register instead of computing s2 - s2, which
// yields NaN when the register holds NaN or Inf.
// Clobbers r3; r3 is dead at every use of this macro in the main routine.
.macro INIT_F1
mov r3 , #0
vmov s2 , r3 // s2 = +0.0 (all bits zero)
.endm
// KERNEL_F1X4 (single, unit stride): four steps of one dot product.
// s2 += x[k] * a1[k] for k = 0..3; XO and AO1 post-increment past the
// 4 elements.
.macro KERNEL_F1X4
fldmias XO! , { s12 - s15 }
fldmias AO1!, { s8 - s9 }
fldmias AO1!, { s10 - s11 }
vmla.f32 s2 , s12 , s8
vmla.f32 s2 , s13 , s9
vmla.f32 s2 , s14, s10
vmla.f32 s2 , s15, s11
.endm
// KERNEL_F1X1 (single, unit stride): s2 += (*XO) * (*AO1); both pointers
// advance by one element.
.macro KERNEL_F1X1
fldmias XO! , { s1 }
fldmias AO1!, { s8 }
vmla.f32 s2 , s1 , s8
.endm
// SAVE_F1 (single, unit stride): *YO += s0 * s2, then YO advances by one
// element. s0 is never written in this file (presumably alpha - confirm).
.macro SAVE_F1
fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO!, { s4 }
.endm
// INIT_S2 (single, strided): clear the two accumulators s2 and s3.
// Zero is moved in from a core register instead of computing s2 - s2, which
// yields NaN when the register holds NaN or Inf (stale data or an overflowed
// accumulator from the previous pass).
// Clobbers r3; r3 is dead at every use of this macro in the main routine
// (its last use there is "str r3 , A").
.macro INIT_S2
mov r3 , #0
vmov s2 , r3 // s2 = +0.0 (all bits zero)
vmov.f32 s3 , s2
.endm
// KERNEL_S2X4 (single, strided): four steps of two dot products.
// x[0..3] are loaded one at a time with XO stepping by INC_X bytes; a1[0..3]
// and a2[0..3] come from AO1 / AO2 with post-increment. Then
//   s2 += x[k] * a1[k],  s3 += x[k] * a2[k]   for k = 0..3.
.macro KERNEL_S2X4
fldmias XO , { s12 }
add XO, XO, INC_X
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s4 - s5 }
fldmias XO , { s13 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }
fldmias AO2!, { s6 - s7 }
fldmias XO , { s14 }
add XO, XO, INC_X
fldmias XO , { s15 }
add XO, XO, INC_X
vmla.f32 s2 , s12 , s8
vmla.f32 s3 , s12 , s4
vmla.f32 s2 , s13 , s9
vmla.f32 s3 , s13 , s5
vmla.f32 s2 , s14, s10
vmla.f32 s3 , s14, s6
vmla.f32 s2 , s15, s11
vmla.f32 s3 , s15, s7
.endm
// KERNEL_S2X1 (single, strided): single step of two dot products:
// s2 += (*XO) * (*AO1), s3 += (*XO) * (*AO2); AO1 and AO2 advance by one
// element, XO by INC_X bytes.
.macro KERNEL_S2X1
fldmias XO , { s1 }
fldmias AO1!, { s8 }
fldmias AO2!, { s4 }
vmla.f32 s2 , s1 , s8
add XO, XO, INC_X
vmla.f32 s3 , s1 , s4
.endm
// SAVE_S2 (single, strided): *YO += s0 * s2, YO += INC_Y, *YO += s0 * s3,
// YO += INC_Y (INC_Y in bytes). s0 is never written in this file
// (presumably alpha - confirm).
.macro SAVE_S2
fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO, { s4 }
add YO, YO, INC_Y
fldmias YO, { s5 }
vmla.f32 s5, s0, s3
fstmias YO, { s5 }
add YO, YO, INC_Y
.endm
// INIT_S1 (single, strided): clear the single accumulator s2.
// Zero is moved in from a core register instead of computing s2 - s2, which
// yields NaN when the register holds NaN or Inf.
// Clobbers r3; r3 is dead at every use of this macro in the main routine.
.macro INIT_S1
mov r3 , #0
vmov s2 , r3 // s2 = +0.0 (all bits zero)
.endm
// KERNEL_S1X4 (single, strided): four steps of one dot product.
// x[0..3] are loaded one at a time with XO stepping by INC_X bytes;
// s2 += x[k] * a1[k] for k = 0..3 with a1[] read from AO1 (post-increment).
.macro KERNEL_S1X4
fldmias XO , { s12 }
add XO, XO, INC_X
pld [ AO1 , #A_PRE ]
fldmias AO1!, { s8 - s9 }
fldmias XO , { s13 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }
fldmias XO , { s14 }
add XO, XO, INC_X
fldmias XO , { s15 }
add XO, XO, INC_X
vmla.f32 s2 , s12 , s8
vmla.f32 s2 , s13 , s9
vmla.f32 s2 , s14, s10
vmla.f32 s2 , s15, s11
.endm
// KERNEL_S1X1 (single, strided): s2 += (*XO) * (*AO1); AO1 advances by one
// element, XO by INC_X bytes.
.macro KERNEL_S1X1
fldmias XO , { s1 }
fldmias AO1!, { s8 }
vmla.f32 s2 , s1 , s8
add XO, XO, INC_X
.endm
// SAVE_S1 (single, strided): *YO += s0 * s2, then YO += INC_Y (bytes).
// s0 is never written in this file (presumably alpha - confirm).
.macro SAVE_S1
fldmias YO, { s4 }
vmla.f32 s4, s0, s2
fstmias YO, { s4 }
add YO, YO, INC_Y
.endm
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// gemv_t entry point (VFP variant).
// Flow: save registers, return early when M <= 0, N <= 0 or an increment is
// zero, then take the F path when INC_X == 1 and INC_Y == 1, otherwise the
// S path (increments scaled to bytes). Both paths handle N two at a time
// (two A pointers, AO1 and AO2 = AO1 + LDA) plus one leftover when N is odd,
// and walk M in steps of 4 plus remainder. r0 is always set to 0 on return.
PROLOGUE
.align 5
push {r4 - r9 , fp} // 7 core registers = 28 bytes
add fp, sp, #28 // fp = value of sp at entry
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192 // r12 = start of the FP register save area
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
cmp M, #0 // M <= 0: nothing to do
ble gemvt_kernel_L999
cmp OLD_N, #0 // N <= 0: nothing to do
ble gemvt_kernel_L999
str OLD_A, A // r3 and r1 are reused below, keep A and N on the stack
str OLD_N, N
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
cmp INC_X, #0 // a zero increment exits without touching Y
beq gemvt_kernel_L999
cmp INC_Y, #0
beq gemvt_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif
cmp INC_X, #1 // any non-unit increment selects the S path
bne gemvt_kernel_S2_BEGIN
cmp INC_Y, #1
bne gemvt_kernel_S2_BEGIN
gemvt_kernel_F2_BEGIN: // F path: INC_X == 1 and INC_Y == 1
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_F1_BEGIN // no pair: only the odd leftover, if any
gemvt_kernel_F2X4: // one pass per pair, J counts the passes down
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA // saved A pointer moves on by 2 * LDA for the next pass
str r3 , A
ldr XO , X // X is re-read from its start for every pass
INIT_F2
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F2X1
gemvt_kernel_F2X4_10: // I invocations of KERNEL_F2X4
KERNEL_F2X4
subs I, I, #1
bne gemvt_kernel_F2X4_10
gemvt_kernel_F2X1:
ands I, M , #3 // I = M % 4
ble gemvt_kernel_F2_END
gemvt_kernel_F2X1_10: // I invocations of KERNEL_F2X1
KERNEL_F2X1
subs I, I, #1
bne gemvt_kernel_F2X1_10
gemvt_kernel_F2_END:
SAVE_F2 // fold the two accumulators into Y
subs J , J , #1
bne gemvt_kernel_F2X4
gemvt_kernel_F1_BEGIN:
ldr J, N
ands J, J, #1 // leftover when N is odd
ble gemvt_kernel_L999
gemvt_kernel_F1X4:
ldr AO1, A // A is not advanced here: this is the last pass
ldr XO , X
INIT_F1
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F1X1
gemvt_kernel_F1X4_10: // I invocations of KERNEL_F1X4
KERNEL_F1X4
subs I, I, #1
bne gemvt_kernel_F1X4_10
gemvt_kernel_F1X1:
ands I, M , #3 // I = M % 4
ble gemvt_kernel_F1_END
gemvt_kernel_F1X1_10: // I invocations of KERNEL_F1X1
KERNEL_F1X1
subs I, I, #1
bne gemvt_kernel_F1X1_10
gemvt_kernel_F1_END:
SAVE_F1
b gemvt_kernel_L999
/*************************************************************************************************************/
gemvt_kernel_S2_BEGIN: // S path: at least one increment is not 1
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_S1_BEGIN // no pair: only the odd leftover, if any
gemvt_kernel_S2X4: // one pass per pair, J counts the passes down
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA // saved A pointer moves on by 2 * LDA for the next pass
str r3 , A
ldr XO , X
INIT_S2
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S2X1
gemvt_kernel_S2X4_10: // I invocations of KERNEL_S2X4
KERNEL_S2X4
subs I, I, #1
bne gemvt_kernel_S2X4_10
gemvt_kernel_S2X1:
ands I, M , #3 // I = M % 4
ble gemvt_kernel_S2_END
gemvt_kernel_S2X1_10: // I invocations of KERNEL_S2X1
KERNEL_S2X1
subs I, I, #1
bne gemvt_kernel_S2X1_10
gemvt_kernel_S2_END:
SAVE_S2 // fold the two accumulators into Y
subs J , J , #1
bne gemvt_kernel_S2X4
gemvt_kernel_S1_BEGIN:
ldr J, N
ands J, J, #1 // leftover when N is odd
ble gemvt_kernel_L999
gemvt_kernel_S1X4:
ldr AO1, A // A is not advanced here: this is the last pass
ldr XO , X
INIT_S1
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S1X1
gemvt_kernel_S1X4_10: // I invocations of KERNEL_S1X4
KERNEL_S1X4
subs I, I, #1
bne gemvt_kernel_S1X4_10
gemvt_kernel_S1X1:
ands I, M , #3 // I = M % 4
ble gemvt_kernel_S1_END
gemvt_kernel_S1X1_10: // I invocations of KERNEL_S1X1
KERNEL_S1X1
subs I, I, #1
bne gemvt_kernel_S1X1_10
gemvt_kernel_S1_END:
SAVE_S1
/*************************************************************************************************************/
gemvt_kernel_L999: // common exit
sub r3, fp, #192 // same save area address as in the prologue
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28 // sp back to just below the pushed core registers
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

732
kernel/arm/gemv_t_vfpv3.S Normal file
View File

@ -0,0 +1,732 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/18 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Local stack size; presumably subtracted from sp in the prologue, which is
 * further down in this file - confirm there. */
#define STACKSIZE 256
/* fp-relative operands at non-negative offsets: presumably the arguments
 * passed on the caller stack, with fp set to the entry sp - confirm against
 * the prologue. */
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
/* Register aliases. OLD_N and AO1 both name r1, so N has to be saved (see
 * the N slot below) before AO1 is first written. */
#define OLD_A r3
#define OLD_N r1
#define M r0
#define AO1 r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
/* fp-relative operands at negative offsets: local stack slots. */
#define N [fp, #-252 ]
#define A [fp, #-256 ]
/* Byte offsets added to XO / AO1 / AO2 by the pld prefetches in the macros. */
#define X_PRE 512
#define A_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(DOUBLE)
// INIT_F2 (double, unit stride): clear the two accumulators d4 and d5.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf (stale data
// or an overflowed accumulator from an earlier pass).
.macro INIT_F2
vmov.f64 d4 , #1.0 // any finite value will do
vsub.f64 d4 , d4 , d4 // d4 = 0.0
vmov.f64 d5 , d4
.endm
// KERNEL_F2X4 (double, unit stride): four steps of two dot products.
// Loads x[0..3] from XO, a1[0..3] from AO1 and a2[0..3] from AO2 (all three
// pointers post-increment past the 4 elements), then
//   d4 += x[k] * a1[k],  d5 += x[k] * a2[k]   for k = 0..3.
// The second pair of A loads is interleaved with the multiply-accumulates.
.macro KERNEL_F2X4
pld [ XO , #X_PRE ]
fldmiad XO! , { d28 - d31 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d16 - d17 }
vmla.f64 d4 , d28 , d8
vmla.f64 d5 , d28 , d16
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
vmla.f64 d5 , d29 , d17
fldmiad AO2!, { d18 - d19 }
vmla.f64 d4 , d30, d10
vmla.f64 d5 , d30, d18
vmla.f64 d4 , d31, d11
vmla.f64 d5 , d31, d19
.endm
// KERNEL_F2X1 (double, unit stride): single step of two dot products:
// d4 += (*XO) * (*AO1), d5 += (*XO) * (*AO2); all three pointers advance by
// one element.
.macro KERNEL_F2X1
fldmiad XO! , { d2 }
fldmiad AO1!, { d8 }
fldmiad AO2!, { d16 }
vmla.f64 d4 , d2 , d8
vmla.f64 d5 , d2 , d16
.endm
// SAVE_F2 (double, unit stride): y[0] += d0 * d4, y[1] += d0 * d5, then YO
// advances by two elements. No macro in this file writes d0 (presumably
// alpha from function entry - confirm).
.macro SAVE_F2
fldmiad YO, { d24 - d25 }
vmla.f64 d24, d0, d4
vmla.f64 d25, d0, d5
fstmiad YO!, { d24 - d25 }
.endm
// INIT_S2 (double, strided): clear the two accumulators d4 and d5.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf (stale data
// or an overflowed accumulator from an earlier pass).
.macro INIT_S2
vmov.f64 d4 , #1.0 // any finite value will do
vsub.f64 d4 , d4 , d4 // d4 = 0.0
vmov.f64 d5 , d4
.endm
// KERNEL_S2X4 (double, strided): four steps of two dot products.
// x[0..3] are loaded one at a time with XO stepping by INC_X; a1[0..3] and
// a2[0..3] come from AO1 / AO2 with post-increment. Then
//   d4 += x[k] * a1[k],  d5 += x[k] * a2[k]   for k = 0..3.
// Loads are interleaved with the multiply-accumulates.
.macro KERNEL_S2X4
pld [ AO1 , #A_PRE ]
fldmiad XO , { d28 }
add XO, XO, INC_X
fldmiad AO1!, { d8 - d9 }
pld [ AO2 , #A_PRE ]
fldmiad AO2!, { d16 - d17 }
vmla.f64 d4 , d28 , d8
fldmiad XO , { d29 }
add XO, XO, INC_X
vmla.f64 d5 , d28 , d16
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
fldmiad XO , { d30 }
add XO, XO, INC_X
vmla.f64 d5 , d29 , d17
fldmiad AO2!, { d18 - d19 }
vmla.f64 d4 , d30, d10
fldmiad XO , { d31 }
add XO, XO, INC_X
vmla.f64 d5 , d30, d18
vmla.f64 d4 , d31, d11
vmla.f64 d5 , d31, d19
.endm
// KERNEL_S2X1 (double, strided): single step of two dot products:
// d4 += (*XO) * (*AO1), d5 += (*XO) * (*AO2); AO1 and AO2 advance by one
// element, XO by INC_X.
.macro KERNEL_S2X1
fldmiad XO , { d2 }
fldmiad AO1!, { d8 }
add XO, XO, INC_X
fldmiad AO2!, { d16 }
vmla.f64 d4 , d2 , d8
vmla.f64 d5 , d2 , d16
.endm
// SAVE_S2 (double, strided): *YO += d0 * d4, YO += INC_Y, *YO += d0 * d5,
// YO += INC_Y. d24 is reused as the temporary for both elements. No macro
// in this file writes d0 (presumably alpha from function entry - confirm).
.macro SAVE_S2
fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO, { d24 }
add YO, YO, INC_Y
fldmiad YO, { d24 }
vmla.f64 d24, d0, d5
fstmiad YO, { d24 }
add YO, YO, INC_Y
.endm
// INIT_F1 (double, unit stride): clear the single accumulator d4.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_F1
vmov.f64 d4 , #1.0 // any finite value will do
vsub.f64 d4 , d4 , d4 // d4 = 0.0
.endm
// KERNEL_F1X4 (double, unit stride): four steps of one dot product.
// d4 += x[k] * a1[k] for k = 0..3; XO and AO1 post-increment past the
// 4 elements.
.macro KERNEL_F1X4
pld [ XO , #X_PRE ]
fldmiad XO! , { d28 - d31 }
pld [ AO1 , #A_PRE ]
fldmiad AO1!, { d8 - d9 }
vmla.f64 d4 , d28 , d8
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
vmla.f64 d4 , d30, d10
vmla.f64 d4 , d31, d11
.endm
// KERNEL_F1X1 (double, unit stride): d4 += (*XO) * (*AO1); both pointers
// advance by one element.
.macro KERNEL_F1X1
fldmiad XO! , { d2 }
fldmiad AO1!, { d8 }
vmla.f64 d4 , d2 , d8
.endm
// SAVE_F1 (double, unit stride): *YO += d0 * d4, then YO advances by one
// element. No macro in this file writes d0 (presumably alpha - confirm).
.macro SAVE_F1
fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO!, { d24 }
.endm
// INIT_S1 (double, strided): clear the single accumulator d4.
// A finite constant is loaded first because subtracting a register from
// itself yields NaN, not 0, when the register holds NaN or Inf.
.macro INIT_S1
vmov.f64 d4 , #1.0 // any finite value will do
vsub.f64 d4 , d4 , d4 // d4 = 0.0
.endm
// KERNEL_S1X4 (double, strided): four steps of one dot product.
// x[0..3] are loaded one at a time with XO stepping by INC_X;
// d4 += x[k] * a1[k] for k = 0..3 with a1[] read from AO1 (post-increment).
.macro KERNEL_S1X4
pld [ AO1 , #A_PRE ]
fldmiad XO , { d28 }
add XO, XO, INC_X
fldmiad AO1!, { d8 - d9 }
vmla.f64 d4 , d28 , d8
fldmiad XO , { d29 }
add XO, XO, INC_X
fldmiad AO1!, { d10 - d11 }
vmla.f64 d4 , d29 , d9
fldmiad XO , { d30 }
add XO, XO, INC_X
vmla.f64 d4 , d30, d10
fldmiad XO , { d31 }
add XO, XO, INC_X
vmla.f64 d4 , d31, d11
.endm
// KERNEL_S1X1 (double, strided): d4 += (*XO) * (*AO1); AO1 advances by one
// element, XO by INC_X.
.macro KERNEL_S1X1
fldmiad XO , { d2 }
fldmiad AO1!, { d8 }
add XO, XO, INC_X
vmla.f64 d4 , d2 , d8
.endm
// SAVE_S1 (double, strided): *YO += d0 * d4, then YO += INC_Y.
// No macro in this file writes d0 (presumably alpha - confirm).
.macro SAVE_S1
fldmiad YO, { d24 }
vmla.f64 d24, d0, d4
fstmiad YO, { d24 }
add YO, YO, INC_Y
.endm
#else /************************* SINGLE PRECISION *****************************************/
// INIT_F2 (single): reset the two accumulators s4 and s5 used by KERNEL_F2X4,
// KERNEL_F2X1 and SAVE_F2.
// NOTE(review): self-subtraction yields 0.0 only for finite old contents.
.macro INIT_F2
vsub.f32 s4 , s4 , s4
vsub.f32 s5 , s5 , s5
.endm
// KERNEL_F2X4 (single): four contiguous x elements against two A streams.
// x_0..x_3 are loaded into s28..s31 with one post-increment load of XO.
// For k = 0..3: s4 += x_k * a1_k (s8..s11 via AO1) and
//               s5 += x_k * a2_k (s16..s19 via AO2).
// Unlike the double-precision S2X4/F1X4 macros visible in this file, this
// variant issues no pld prefetches.
.macro KERNEL_F2X4
fldmias XO! , { s28 - s31 }
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s16 - s17 }
vmla.f32 s4 , s28 , s8
vmla.f32 s5 , s28 , s16
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
vmla.f32 s5 , s29 , s17
fldmias AO2!, { s18 - s19 }
vmla.f32 s4 , s30, s10
vmla.f32 s5 , s30, s18
vmla.f32 s4 , s31, s11
vmla.f32 s5 , s31, s19
.endm
// KERNEL_F2X1 (single): one contiguous x element against two A streams.
// s4 += x * a1 and s5 += x * a2, with x in s2, a1 in s8 and a2 in s16; XO,
// AO1 and AO2 each post-increment by one float.
.macro KERNEL_F2X1
fldmias XO! , { s2 }
fldmias AO1!, { s8 }
fldmias AO2!, { s16 }
vmla.f32 s4 , s2 , s8
vmla.f32 s5 , s2 , s16
.endm
// SAVE_F2 (single): update two contiguous y elements at once.
// y[0] += s0 * s4 and y[1] += s0 * s5; the store post-increments YO by two
// floats.
// s0 is only read here; presumably the alpha scale factor - confirm upstream.
.macro SAVE_F2
fldmias YO, { s24 - s25 }
vmla.f32 s24, s0, s4
vmla.f32 s25, s0, s5
fstmias YO!, { s24 - s25 }
.endm
// INIT_S2 (single): reset the two accumulators s4 and s5 used by KERNEL_S2X4,
// KERNEL_S2X1 and SAVE_S2.
// NOTE(review): self-subtraction yields 0.0 only for finite old contents.
.macro INIT_S2
vsub.f32 s4 , s4 , s4
vsub.f32 s5 , s5 , s5
.endm
// KERNEL_S2X4 (single): four strided x elements against two A streams.
// For k = 0..3: s4 += x_k * a1_k and s5 += x_k * a2_k, where x_k is in
// s28..s31, a1_k in s8..s11 (via AO1) and a2_k in s16..s19 (via AO2).
// XO is read without writeback and advanced by INC_X after each element;
// AO1 and AO2 advance by post-increment loads of two floats.
// Loads and multiply-accumulates are interleaved, presumably for latency
// hiding - keep the instruction order when editing.
.macro KERNEL_S2X4
fldmias XO , { s28 }
add XO, XO, INC_X
fldmias AO1!, { s8 - s9 }
fldmias AO2!, { s16 - s17 }
vmla.f32 s4 , s28 , s8
fldmias XO , { s29 }
add XO, XO, INC_X
vmla.f32 s5 , s28 , s16
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
fldmias XO , { s30 }
add XO, XO, INC_X
vmla.f32 s5 , s29 , s17
fldmias AO2!, { s18 - s19 }
vmla.f32 s4 , s30, s10
fldmias XO , { s31 }
add XO, XO, INC_X
vmla.f32 s5 , s30, s18
vmla.f32 s4 , s31, s11
vmla.f32 s5 , s31, s19
.endm
// KERNEL_S2X1 (single): one strided x element against two A streams.
// s4 += x * a1 and s5 += x * a2, with x in s2, a1 in s8 and a2 in s16.
// XO advances by INC_X; AO1 and AO2 post-increment by one float each.
.macro KERNEL_S2X1
fldmias XO , { s2 }
fldmias AO1!, { s8 }
add XO, XO, INC_X
fldmias AO2!, { s16 }
vmla.f32 s4 , s2 , s8
vmla.f32 s5 , s2 , s16
.endm
// SAVE_S2 (single): fold both accumulators into two strided y elements.
// y[0] += s0 * s4, then YO += INC_Y; y[1] += s0 * s5, then YO += INC_Y.
// s0 is only read here; presumably the alpha scale factor - confirm upstream.
.macro SAVE_S2
fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO, { s24 }
add YO, YO, INC_Y
fldmias YO, { s24 }
vmla.f32 s24, s0, s5
fstmias YO, { s24 }
add YO, YO, INC_Y
.endm
// INIT_F1 (single): reset the single accumulator s4 used by KERNEL_F1X4,
// KERNEL_F1X1 and SAVE_F1 (self-subtraction; NaN/Inf would not be cleared).
.macro INIT_F1
vsub.f32 s4 , s4 , s4
.endm
// KERNEL_F1X4 (single): four contiguous x elements against one A stream.
// x_0..x_3 go to s28..s31 (one post-increment load of XO), a_0..a_3 to
// s8..s11 (two post-increment loads of AO1); s4 += x_k * a_k for k = 0..3.
.macro KERNEL_F1X4
fldmias XO! , { s28 - s31 }
fldmias AO1!, { s8 - s9 }
vmla.f32 s4 , s28 , s8
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
vmla.f32 s4 , s30, s10
vmla.f32 s4 , s31, s11
.endm
// KERNEL_F1X1 (single): one contiguous x element against one A stream.
// s4 += x * a with x in s2 and a in s8; XO and AO1 post-increment by one float.
.macro KERNEL_F1X1
fldmias XO! , { s2 }
fldmias AO1!, { s8 }
vmla.f32 s4 , s2 , s8
.endm
// SAVE_F1 (single): y[0] += s0 * s4 for contiguous y; the store post-increments
// YO by one float.
// s0 is only read here; presumably the alpha scale factor - confirm upstream.
.macro SAVE_F1
fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO!, { s24 }
.endm
// INIT_S1 (single): reset the single accumulator s4 used by KERNEL_S1X4,
// KERNEL_S1X1 and SAVE_S1 (self-subtraction; NaN/Inf would not be cleared).
.macro INIT_S1
vsub.f32 s4 , s4 , s4
.endm
// KERNEL_S1X4 (single): four strided x elements against one A stream.
// For k = 0..3: s4 += x_k * a_k, with x_k in s28..s31 and a_k in s8..s11.
// XO is read without writeback and advanced by INC_X after each element;
// AO1 advances by post-increment loads of two floats.
.macro KERNEL_S1X4
fldmias XO , { s28 }
add XO, XO, INC_X
fldmias AO1!, { s8 - s9 }
vmla.f32 s4 , s28 , s8
fldmias XO , { s29 }
add XO, XO, INC_X
fldmias AO1!, { s10 - s11 }
vmla.f32 s4 , s29 , s9
fldmias XO , { s30 }
add XO, XO, INC_X
vmla.f32 s4 , s30, s10
fldmias XO , { s31 }
add XO, XO, INC_X
vmla.f32 s4 , s31, s11
.endm
// KERNEL_S1X1 (single): one strided x element against one A stream.
// s4 += x * a with x in s2 and a in s8; XO advances by INC_X, AO1 by one float.
.macro KERNEL_S1X1
fldmias XO , { s2 }
fldmias AO1!, { s8 }
add XO, XO, INC_X
vmla.f32 s4 , s2 , s8
.endm
// SAVE_S1 (single): y[0] += s0 * s4 for strided y, then YO += INC_Y.
// s0 is only read here; presumably the alpha scale factor - confirm upstream.
.macro SAVE_S1
fldmias YO, { s24 }
vmla.f32 s24, s0, s4
fstmias YO, { s24 }
add YO, YO, INC_Y
.endm
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Driver of the gemv-t kernel (all labels are prefixed gemvt_kernel_).
// Outer loop J walks the A streams two at a time (AO1 and AO2 = AO1 + LDA),
// inner loop I walks M elements, four per KERNEL_*X4 expansion plus a
// remainder of M & 3 handled one element at a time. After each inner loop a
// SAVE_* macro adds the scaled accumulator(s) to y. When N is odd a final
// single-stream pass follows.
// The *_F* macros are used when both INC_X and INC_Y equal 1, the *_S* macros
// otherwise. Always returns 0 in r0.
// M, OLD_N, OLD_A, A, N, X, Y, OLD_LDA, OLD_INC_X, OLD_INC_Y and the register
// aliases (AO1, AO2, XO, YO, I, J, LDA, INC_X, INC_Y) are defined above this
// section of the file.
PROLOGUE
.align 5
// Save 7 core registers (28 bytes) and make fp equal to the entry value of sp.
push {r4 - r9 , fp}
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
// VFP save area starts at fp - 192; restored from the same address at L999.
sub r12, fp, #192
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s31 } // store floating point registers
#endif
// Nothing to do for empty dimensions.
cmp M, #0
ble gemvt_kernel_L999
cmp OLD_N, #0
ble gemvt_kernel_L999
// A and N are accessed with str/ldr, i.e. they are memory operands here.
str OLD_A, A
str OLD_N, N
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
// A zero stride on either vector exits without touching y.
cmp INC_X, #0
beq gemvt_kernel_L999
cmp INC_Y, #0
beq gemvt_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #3 // LDA * SIZE
#else
lsl LDA, LDA, #2 // LDA * SIZE
#endif
// Choose the contiguous (F) path only when both strides are exactly 1.
cmp INC_X, #1
bne gemvt_kernel_S2_BEGIN
cmp INC_Y, #1
bne gemvt_kernel_S2_BEGIN
// ---- contiguous path, two A streams per outer iteration ----
gemvt_kernel_F2_BEGIN:
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_F1_BEGIN
gemvt_kernel_F2X4:
// AO1 = current stream, AO2 = next stream; the stored A is moved two streams
// ahead (r3 is used as scratch) for the following outer iteration.
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
// x is re-read from its start for every outer iteration.
ldr XO , X
INIT_F2
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F2X1
gemvt_kernel_F2X4_10:
KERNEL_F2X4
subs I, I, #1
bne gemvt_kernel_F2X4_10
gemvt_kernel_F2X1:
// Remainder of M modulo 4.
// NOTE(review): ands does not update V, so this ble relies on V being clear
// from the previous flag-setting instruction; beq would express the intent
// (skip when the remainder is zero) without that dependency.
ands I, M , #3
ble gemvt_kernel_F2_END
gemvt_kernel_F2X1_10:
KERNEL_F2X1
subs I, I, #1
bne gemvt_kernel_F2X1_10
gemvt_kernel_F2_END:
SAVE_F2
subs J , J , #1
bne gemvt_kernel_F2X4
// ---- contiguous path, leftover single stream when N is odd ----
gemvt_kernel_F1_BEGIN:
ldr J, N
ands J, J, #1
ble gemvt_kernel_L999
gemvt_kernel_F1X4:
ldr AO1, A
ldr XO , X
INIT_F1
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_F1X1
gemvt_kernel_F1X4_10:
KERNEL_F1X4
subs I, I, #1
bne gemvt_kernel_F1X4_10
gemvt_kernel_F1X1:
ands I, M , #3
ble gemvt_kernel_F1_END
gemvt_kernel_F1X1_10:
KERNEL_F1X1
subs I, I, #1
bne gemvt_kernel_F1X1_10
gemvt_kernel_F1_END:
SAVE_F1
b gemvt_kernel_L999
/*************************************************************************************************************/
// ---- strided path: same structure as above, using the *_S* macros ----
gemvt_kernel_S2_BEGIN:
// Convert the element strides into byte strides.
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble gemvt_kernel_S1_BEGIN
gemvt_kernel_S2X4:
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
ldr XO , X
INIT_S2
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S2X1
gemvt_kernel_S2X4_10:
KERNEL_S2X4
subs I, I, #1
bne gemvt_kernel_S2X4_10
gemvt_kernel_S2X1:
ands I, M , #3
ble gemvt_kernel_S2_END
gemvt_kernel_S2X1_10:
KERNEL_S2X1
subs I, I, #1
bne gemvt_kernel_S2X1_10
gemvt_kernel_S2_END:
SAVE_S2
subs J , J , #1
bne gemvt_kernel_S2X4
// ---- strided path, leftover single stream when N is odd ----
gemvt_kernel_S1_BEGIN:
ldr J, N
ands J, J, #1
ble gemvt_kernel_L999
gemvt_kernel_S1X4:
ldr AO1, A
ldr XO , X
INIT_S1
asrs I, M, #2 // I = M / 4
ble gemvt_kernel_S1X1
gemvt_kernel_S1X4_10:
KERNEL_S1X4
subs I, I, #1
bne gemvt_kernel_S1X4_10
gemvt_kernel_S1X1:
ands I, M , #3
ble gemvt_kernel_S1_END
gemvt_kernel_S1X1_10:
KERNEL_S1X1
subs I, I, #1
bne gemvt_kernel_S1X1_10
gemvt_kernel_S1_END:
SAVE_S1
/*************************************************************************************************************/
// Common exit: restore the VFP registers saved in the prologue, unwind the
// stack frame and return 0.
gemvt_kernel_L999:
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s31 } // restore floating point registers
#endif
mov r0, #0 // set return value
// sp = fp - 28 points at the seven registers pushed on entry.
sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

75
kernel/arm/iamax.c Normal file
View File

@ -0,0 +1,75 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the precision selected for FLOAT. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Index of the element with the largest absolute value.
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the 1-based position of the first element whose absolute value is
 * maximal, or 0 when n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG max = 0;
    FLOAT maxf;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(max);

    maxf = ABS(x[0]);
    ix = inc_x;

    for (i = 1; i < n; i++) {
        FLOAT v = ABS(x[ix]);

        /* strict '>' keeps the earliest position when values tie */
        if (v > maxf) {
            max = i;
            maxf = v;
        }
        ix += inc_x;
    }

    return(max + 1);
}

478
kernel/arm/iamax_vfp.S Normal file
View File

@ -0,0 +1,478 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Register aliases and constants used by the iamax VFP kernel.
// STACKSIZE is not referenced anywhere else in this file.
#define STACKSIZE 256
// r0: remaining element count (tested against 0 and decremented in the driver)
#define N r0
// r1: address of the next element to load
#define X r1
// r2: stride; scaled to bytes in the strided path before use
#define INC_X r2
// r3: 1-based position of the current winner; copied to r0 on exit
#define INDEX r3
// r4: 1-based position of the element being examined
#define Z r4
// r12: inner loop counter
#define I r12
// Byte offset used by the pld prefetch hints
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// VABS(dst, src): absolute value in the selected precision when USE_ABS is
// defined; otherwise it expands to a nop so values are compared with their
// sign. Only the real-valued macros use it; the complex macros always take
// absolute values directly.
#if defined(USE_ABS)
#if defined(DOUBLE)
#define VABS(x0,x1) vabs.f64 x0, x1
#else
#define VABS(x0,x1) vabs.f32 x0, x1
#endif
#else
#define VABS(x0,x1) nop
#endif
/*****************************************************************************************/
// Conditional moves applied after each compare of a candidate against the
// current winner: MOVCOND updates the integer index, VMOVCOND the stored
// value. With USE_MIN they fire on "less than" (lt), otherwise on "greater
// than" (gt), so on equal values the earlier element is kept in both modes.
#if defined(USE_MIN)
#define MOVCOND movlt
#if defined(DOUBLE)
#define VMOVCOND vmovlt.f64
#else
#define VMOVCOND vmovlt.f32
#endif
#else
#define MOVCOND movgt
#if defined(DOUBLE)
#define VMOVCOND vmovgt.f64
#else
#define VMOVCOND vmovgt.f32
#endif
#endif
/*****************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
// INIT_F (real, double, contiguous): take the first element as the initial
// winner. Loads it into d0 (X post-increments), applies VABS, and sets both
// the position counter Z and the result INDEX to 1.
.macro INIT_F
fldmiad X!, { d0 }
VABS( d0, d0 )
mov Z, #1
mov INDEX, Z
.endm
// KERNEL_F1 (real, double, contiguous): examine one more element.
// Loads it into d4 (X post-increments), bumps Z, applies VABS, compares d4
// with the current winner d0 and, if the MOVCOND condition holds, replaces
// both d0 and INDEX.
.macro KERNEL_F1
fldmiad X!, { d4 }
add Z, Z, #1
VABS( d4, d4 )
vcmpe.f64 d4, d0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
.endm
// INIT_S (real, double, strided): take the first element as the initial
// winner. Loads it into d0 without writeback, applies VABS, sets Z and INDEX
// to 1, then advances X by INC_X.
.macro INIT_S
fldmiad X, { d0 }
VABS( d0, d0 )
mov Z, #1
mov INDEX, Z
add X, X, INC_X
.endm
// KERNEL_S1 (real, double, strided): examine one more element.
// Same compare-and-select as KERNEL_F1, but X is read without writeback and
// advanced by INC_X at the end.
.macro KERNEL_S1
fldmiad X, { d4 }
add Z, Z, #1
VABS( d4, d4 )
vcmpe.f64 d4, d0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
add X, X, INC_X
.endm
#else
// INIT_F (real, single, contiguous): take the first element as the initial
// winner. Loads it into s0 (X post-increments), applies VABS, and sets both
// Z and INDEX to 1.
.macro INIT_F
fldmias X!, { s0 }
VABS( s0, s0 )
mov Z, #1
mov INDEX, Z
.endm
// KERNEL_F1 (real, single, contiguous): examine one more element.
// Loads it into s4 (X post-increments), bumps Z, applies VABS, compares s4
// with the current winner s0 and, if the MOVCOND condition holds, replaces
// both s0 and INDEX.
.macro KERNEL_F1
fldmias X!, { s4 }
add Z, Z, #1
VABS( s4, s4 )
vcmpe.f32 s4, s0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
.endm
// INIT_S (real, single, strided): take the first element as the initial
// winner. Loads it into s0 without writeback, applies VABS, sets Z and INDEX
// to 1, then advances X by INC_X.
.macro INIT_S
fldmias X, { s0 }
VABS( s0, s0 )
mov Z, #1
mov INDEX, Z
add X, X, INC_X
.endm
// KERNEL_S1 (real, single, strided): examine one more element.
// Same compare-and-select as KERNEL_F1, but X is read without writeback and
// advanced by INC_X at the end.
.macro KERNEL_S1
fldmias X, { s4 }
add Z, Z, #1
VABS( s4, s4 )
vcmpe.f32 s4, s0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
// INIT_F (complex, double, contiguous): take the first element as the initial
// winner. Loads the two doubles of the element into d0/d1 (X post-increments),
// stores |d0| + |d1| in d0 and sets Z and INDEX to 1.
.macro INIT_F
fldmiad X!, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
mov Z, #1
mov INDEX, Z
.endm
// KERNEL_F1 (complex, double, contiguous): examine one more element.
// Loads its two doubles into d4/d5 (X post-increments), bumps Z, forms
// |d4| + |d5| in d4, compares it with the current winner d0 and, if the
// MOVCOND condition holds, replaces both d0 and INDEX.
.macro KERNEL_F1
fldmiad X!, { d4 - d5 }
add Z, Z, #1
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
.endm
// INIT_S (complex, double, strided): take the first element as the initial
// winner. Loads its two doubles into d0/d1 without writeback, stores
// |d0| + |d1| in d0, sets Z and INDEX to 1, then advances X by INC_X.
.macro INIT_S
fldmiad X, { d0 -d1 }
vabs.f64 d0, d0
vabs.f64 d1, d1
vadd.f64 d0 , d0, d1
mov Z, #1
mov INDEX, Z
add X, X, INC_X
.endm
// KERNEL_S1 (complex, double, strided): examine one more element.
// Same compare-and-select as the complex KERNEL_F1, but X is read without
// writeback and advanced by INC_X at the end.
.macro KERNEL_S1
fldmiad X, { d4 - d5 }
add Z, Z, #1
vabs.f64 d4, d4
vabs.f64 d5, d5
vadd.f64 d4 , d4, d5
vcmpe.f64 d4, d0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND d0, d4
MOVCOND INDEX, Z
add X, X, INC_X
.endm
#else
// INIT_F (complex, single, contiguous): take the first element as the initial
// winner. Loads the two floats of the element into s0/s1 (X post-increments),
// stores |s0| + |s1| in s0 and sets Z and INDEX to 1.
.macro INIT_F
fldmias X!, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
mov Z, #1
mov INDEX, Z
.endm
// KERNEL_F1 (complex, single, contiguous): examine one more element.
// Loads its two floats into s4/s5 (X post-increments), bumps Z, forms
// |s4| + |s5| in s4, compares it with the current winner s0 and, if the
// MOVCOND condition holds, replaces both s0 and INDEX.
.macro KERNEL_F1
fldmias X!, { s4 - s5 }
add Z, Z, #1
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
.endm
// INIT_S (complex, single, strided): take the first element as the initial
// winner. Loads its two floats into s0/s1 without writeback, stores
// |s0| + |s1| in s0, sets Z and INDEX to 1, then advances X by INC_X.
.macro INIT_S
fldmias X, { s0 -s1 }
vabs.f32 s0, s0
vabs.f32 s1, s1
vadd.f32 s0 , s0, s1
mov Z, #1
mov INDEX, Z
add X, X, INC_X
.endm
// KERNEL_S1 (complex, single, strided): examine one more element.
// Same compare-and-select as the complex KERNEL_F1, but X is read without
// writeback and advanced by INC_X at the end.
.macro KERNEL_S1
fldmias X, { s4 - s5 }
add Z, Z, #1
vabs.f32 s4, s4
vabs.f32 s5, s5
vadd.f32 s4 , s4, s5
vcmpe.f32 s4, s0
// copy the VFP comparison flags into the core flags for the conditional moves
vmrs APSR_nzcv, fpscr
VMOVCOND s0, s4
MOVCOND INDEX, Z
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Driver of the index-of-extreme-element kernel.
// Returns in r0 the 1-based position picked by the KERNEL_* macros (strictly
// greater by default, strictly smaller with USE_MIN), or 0 when N <= 0 or
// INC_X == 0. The first element seeds the search (INIT_*), the remaining
// N - 1 elements are examined one KERNEL_* expansion each.
// NOTE(review): a negative INC_X is not rejected here; only zero is.
PROLOGUE
.align 5
// r4 (Z) is the only callee-saved core register used.
push {r4}
// Clear the value register by self-subtraction.
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0
#else
vsub.f32 s0 , s0 , s0
#endif
// Default result for the early exits below.
mov INDEX, #0
cmp N, #0
ble iamax_kernel_L999
cmp INC_X, #0
beq iamax_kernel_L999
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
// ---- contiguous path ----
iamax_kernel_F_BEGIN:
INIT_F
// One element consumed by INIT_F; done if it was the only one.
subs N, N , #1
ble iamax_kernel_L999
asrs I, N, #2 // I = N / 4
ble iamax_kernel_F1
.align 5
// The loop body holds two groups of four KERNEL_F1 expansions. Each group
// consumes one count of I; the first group exits through the ble in the
// middle when I runs out there, the second through the closing bne.
iamax_kernel_F4:
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
ble iamax_kernel_F1
#if defined(COMPLEX) || defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F4
iamax_kernel_F1:
// Remaining (N - 1) modulo 4 elements, one at a time.
// NOTE(review): ands does not update V, so this ble relies on V being clear
// from the previous flag-setting instruction.
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
// ---- strided path ----
iamax_kernel_S_BEGIN:
// Convert the element stride into a byte stride (complex elements are two
// values wide).
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
INIT_S
subs N, N , #1
ble iamax_kernel_L999
asrs I, N, #2 // I = N / 4
ble iamax_kernel_S1
.align 5
// Plain unroll by four; no prefetch in the strided path.
iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
// Common exit.
iamax_kernel_L999:
mov r0, INDEX // set return value
pop {r4}
bx lr
EPILOGUE

75
kernel/arm/iamin.c Normal file
View File

@ -0,0 +1,75 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the precision selected for FLOAT. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * Index of the element with the smallest absolute value.
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the 1-based position of the first element whose absolute value is
 * minimal, or 0 when n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG min = 0;
    FLOAT minf;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(min);

    minf = ABS(x[0]);
    ix = inc_x;

    for (i = 1; i < n; i++) {
        FLOAT v = ABS(x[ix]);

        /* strict '<' keeps the earliest position when values tie */
        if (v < minf) {
            min = i;
            minf = v;
        }
        ix += inc_x;
    }

    return(min + 1);
}

67
kernel/arm/imax.c Normal file
View File

@ -0,0 +1,67 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Index of the largest element (signed comparison, no absolute value).
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the 1-based position of the first maximal element, or 0 when
 * n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG max = 0;
    FLOAT maxf;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(max);

    maxf = x[0];
    ix = inc_x;

    for (i = 1; i < n; i++) {
        /* strict '>' keeps the earliest position when values tie */
        if (x[ix] > maxf) {
            max = i;
            maxf = x[ix];
        }
        ix += inc_x;
    }

    return(max + 1);
}

65
kernel/arm/imin.c Normal file
View File

@ -0,0 +1,65 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/08/19 Saar
* BLASTEST float
* BLASTEST double
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Index of the smallest element (signed comparison, no absolute value).
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the 1-based position of the first minimal element, or 0 when
 * n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG min = 0;
    FLOAT minf;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(min);

    minf = x[0];
    ix = inc_x;

    for (i = 1; i < n; i++) {
        /*
         * A minimum search must replace the candidate when a SMALLER value
         * is found; the previous '>' test made this routine return the
         * position of the maximum. Strict '<' keeps the earliest position
         * on ties.
         */
        if (x[ix] < minf) {
            min = i;
            minf = x[ix];
        }
        ix += inc_x;
    }

    return(min + 1);
}

81
kernel/arm/izamax.c Normal file
View File

@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the precision selected for FLOAT. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * |re| + |im| of the complex value stored at x[i], x[i+1].
 * Fully parenthesized so it can be used inside any expression.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Index of the complex element with the largest |re| + |im|.
 *
 * n      number of complex elements to examine
 * x      interleaved (re, im) input; element k starts at x[2 * k * inc_x]
 * inc_x  stride between consecutive complex elements (must be >= 1)
 *
 * Returns the 1-based position of the first maximal element, or 0 when
 * n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG inc_x2;
    BLASLONG max = 0;
    FLOAT maxf;

    /* An empty vector has no x[0]/x[1]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(max);

    inc_x2 = 2 * inc_x;
    maxf = CABS1(x, 0);
    ix = inc_x2;

    for (i = 1; i < n; i++) {
        FLOAT v = CABS1(x, ix);

        /* strict '>' keeps the earliest position when values tie */
        if (v > maxf) {
            max = i;
            maxf = v;
        }
        ix += inc_x2;
    }

    return(max + 1);
}

81
kernel/arm/izamin.c Normal file
View File

@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/* Absolute-value routine matching the precision selected for FLOAT. */
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/*
 * |re| + |im| of the complex value stored at x[i], x[i+1].
 * Fully parenthesized so it can be used inside any expression.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Index of the complex element with the smallest |re| + |im|.
 *
 * n      number of complex elements to examine
 * x      interleaved (re, im) input; element k starts at x[2 * k * inc_x]
 * inc_x  stride between consecutive complex elements (must be >= 1)
 *
 * Returns the 1-based position of the first minimal element, or 0 when
 * n <= 0 or inc_x < 1.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    BLASLONG inc_x2;
    BLASLONG min = 0;
    FLOAT minf;

    /* An empty vector has no x[0]/x[1]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(min);

    inc_x2 = 2 * inc_x;
    minf = CABS1(x, 0);
    ix = inc_x2;

    for (i = 1; i < n; i++) {
        FLOAT v = CABS1(x, ix);

        /* strict '<' keeps the earliest position when values tie */
        if (v < minf) {
            min = i;
            minf = v;
        }
        ix += inc_x2;
    }

    return(min + 1);
}

63
kernel/arm/max.c Normal file
View File

@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Largest element of a strided vector (signed comparison).
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the maximum value, or 0.0 when n <= 0 or inc_x < 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    FLOAT maxf = 0.0;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(maxf);

    maxf = x[0];
    ix = inc_x;

    for (i = 1; i < n; i++) {
        if (x[ix] > maxf) {
            maxf = x[ix];
        }
        ix += inc_x;
    }

    return(maxf);
}

63
kernel/arm/min.c Normal file
View File

@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : NoTest
* BLASTEST double : NoTest
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
/*
 * Smallest element of a strided vector (signed comparison).
 *
 * n      number of elements to examine
 * x      input vector; element k is read at x[k * inc_x]
 * inc_x  stride between consecutive elements (must be >= 1)
 *
 * Returns the minimum value, or 0.0 when n <= 0 or inc_x < 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    BLASLONG ix;
    FLOAT minf = 0.0;

    /* An empty vector has no x[0]; reject n == 0 together with n < 0. */
    if (n <= 0 || inc_x < 1) return(minf);

    minf = x[0];
    ix = inc_x;

    for (i = 1; i < n; i++) {
        if (x[ix] < minf) {
            minf = x[ix];
        }
        ix += inc_x;
    }

    return(minf);
}

88
kernel/arm/nrm2.c Normal file
View File

@ -0,0 +1,88 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/13 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Euclidean norm of a strided vector, computed as scale * sqrt(ssq) where
 * scale tracks the largest magnitude seen so far and ssq accumulates the
 * squares of the elements divided by that scale.
 *
 *   n      number of elements
 *   x      input vector
 *   inc_x  stride between elements, in units of FLOAT
 *
 * Returns 0.0 for n < 0 or inc_x < 1, and |x[0]| directly when n == 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT largest = 0.0;	/* running scale */
	FLOAT sumsq = 1.0;	/* running scaled sum of squares */
	FLOAT mag = 0.0;
	BLASLONG pos;
	BLASLONG limit;

	if (n < 0) return(0.0);
	if (inc_x < 1) return(0.0);
	if (n == 1) return( ABS(x[0]) );

	/* one past the last index that is visited */
	limit = n * inc_x;

	for (pos = 0; pos < limit; pos += inc_x)
	{
		/* zero entries leave both accumulators untouched */
		if ( x[pos] == 0.0 ) continue;

		mag = ABS( x[pos] );
		if ( largest < mag )
		{
			/* new maximum: re-express the old sum relative to it */
			sumsq = 1 + sumsq * ( largest / mag ) * ( largest / mag );
			largest = mag;
		}
		else
		{
			sumsq += ( mag/largest ) * ( mag/largest );
		}
	}

	largest = largest * sqrt( sumsq );
	return(largest);
}

565
kernel/arm/nrm2_vfp.S Normal file
View File

@ -0,0 +1,565 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/22 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
// KERNEL_F1 ( real, double ): fold one element of a contiguous vector into
// the running ( scale, ssq ) pair.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_F1
// load x and advance X past it ( writeback )
fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
// the two conditional instructions and the branch below all test the flags
// of the same compare
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// KERNEL_F8 ( real, double ): eight consecutive KERNEL_F1 expansions, with a
// prefetch of the data X_PRE bytes ahead of X before each group of four.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( real, double ): fold one element of a strided vector into the
// running ( scale, ssq ) pair, then step X by INC_X ( already in bytes ).
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
// The local label carries the per-expansion counter suffix, as in the other
// kernel macros, so the macro can be expanded more than once.
.macro KERNEL_S1
// load x without writeback; the stride is applied at the end
fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT_\@:
add X, X, INC_X
.endm
#else
// KERNEL_F1 ( real, single ): fold one element of a contiguous vector into
// the running ( scale, ssq ) pair.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_F1
// load x and advance X past it ( writeback )
fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
// the two conditional instructions and the branch below all test the flags
// of the same compare
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// KERNEL_F8 ( real, single ): one prefetch of the data X_PRE bytes ahead of
// X, followed by eight consecutive KERNEL_F1 expansions.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( real, single ): fold one element of a strided vector into the
// running ( scale, ssq ) pair, then step X by INC_X ( already in bytes ).
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
// The local label carries the per-expansion counter suffix, as in the other
// kernel macros, so the macro can be expanded more than once.
.macro KERNEL_S1
// load x without writeback; the stride is applied at the end
fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT_\@:
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
// KERNEL_F1 ( complex, double ): fold one complex element of a contiguous
// vector into the running ( scale, ssq ) pair. The two loaded components
// ( d4, then d5 ) are treated as two independent real values.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_F1
// load both components and advance X past them ( writeback )
fldmiad X!, { d4 - d5 }
// ---- first component ( d4 ) ----
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
// ---- second component ( d5 ), same steps ----
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_F1_END_\@:
.endm
// KERNEL_F8 ( complex, double ): eight consecutive KERNEL_F1 expansions, with
// a prefetch of the data X_PRE bytes ahead of X before each pair.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( complex, double ): fold one complex element of a strided vector
// into the running ( scale, ssq ) pair, then step X by INC_X ( already in
// bytes ). The two loaded components ( d4, then d5 ) are treated as two
// independent real values.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_S1
// load both components without writeback; the stride is applied at the end
fldmiad X, { d4 - d5 }
// ---- first component ( d4 ) ----
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT_\@:
// ---- second component ( d5 ), same steps ----
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X
.endm
#else
// KERNEL_F1 ( complex, single ): fold one complex element of a contiguous
// vector into the running ( scale, ssq ) pair. The two loaded components
// ( s4, then s5 ) are treated as two independent real values.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_F1
// load both components and advance X past them ( writeback )
fldmias X!, { s4 - s5 }
// ---- first component ( s4 ) ----
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
// ---- second component ( s5 ), same steps ----
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_F1_END_\@:
.endm
// KERNEL_F8 ( complex, single ): eight consecutive KERNEL_F1 expansions, with
// a prefetch of the data X_PRE bytes ahead of X before each group of four.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( complex, single ): fold one complex element of a strided vector
// into the running ( scale, ssq ) pair, then step X by INC_X ( already in
// bytes ). The two loaded components ( s4, then s5 ) are treated as two
// independent real values.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_S1
// load both components without writeback; the stride is applied at the end
fldmias X, { s4 - s5 }
// ---- first component ( s4 ) ----
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT_\@:
// ---- second component ( s5 ), same steps ----
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Function body of the nrm2 kernel ( VFP variant ).
// Inputs: N ( r0 ) element count, X ( r1 ) vector pointer, INC_X ( r2 ) stride
// in elements. The result scale * sqrt( ssq ) is left in d0 ( s0 for single
// precision ) before returning through lr.
PROLOGUE
// jump over the inline 1.0 constant that follows
b nrm2_begin
// constant 1.0 in the precision being built; loaded below as the initial ssq
#if defined(COMPLEX)
#if defined(DOUBLE)
znrm2_one:
.word 0x00000000
.word 0x3ff00000
#else
cnrm2_one:
.word 0x3f800000
#endif
#else
#if defined(DOUBLE)
dnrm2_one:
.word 0x00000000
.word 0x3ff00000
#else
snrm2_one:
.word 0x3f800000
#endif
#endif
.align 5
nrm2_begin:
// initialise the accumulators ( scale, ssq ) and keep copies of the
// constants 0.0 and 1.0 that the KERNEL macros compare against and add
#if defined(COMPLEX)
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , znrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , cnrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif
#else
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vldr.64 d1 , dnrm2_one // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vldr.32 s1 , snrm2_one // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif
#endif
// N <= 0 or INC_X == 0: go straight to the exit, which then evaluates
// 0.0 * sqrt( 1.0 )
cmp N, #0
ble nrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
// any stride other than 1 takes the strided loop
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
// ---- contiguous path: blocks of eight elements, then the remainder ----
nrm2_kernel_F_BEGIN:
asrs I, N, #3 // I = N / 8
ble nrm2_kernel_F1
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F1:
// I = N mod 8 leftover elements
ands I, N, #7
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
// ---- strided path: convert INC_X from elements to bytes, one element per
// iteration ----
nrm2_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
nrm2_kernel_S1:
mov I, N
.align 5
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
// ---- exit: result = scale * sqrt( ssq ) ----
nrm2_kernel_L999:
#if defined(DOUBLE)
vsqrt.f64 d1, d1
vmul.f64 d0, d0, d1
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
bx lr
EPILOGUE

508
kernel/arm/nrm2_vfpv3.S Normal file
View File

@ -0,0 +1,508 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/16 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define N r0
#define X r1
#define INC_X r2
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
// KERNEL_F1 ( real, double ): fold one element of a contiguous vector into
// the running ( scale, ssq ) pair.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_F1
// load x and advance X past it ( writeback )
fldmiad X!, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
// the two conditional instructions and the branch below all test the flags
// of the same compare
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// KERNEL_F8 ( real, double ): eight consecutive KERNEL_F1 expansions, with a
// prefetch of the data X_PRE bytes ahead of X before each group of four.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( real, double ): fold one element of a strided vector into the
// running ( scale, ssq ) pair, then step X by INC_X ( already in bytes ).
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
// The local label carries the per-expansion counter suffix, as in the other
// kernel macros, so the macro can be expanded more than once.
.macro KERNEL_S1
// load x without writeback; the stride is applied at the end
fldmiad X, { d4 }
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT_\@:
add X, X, INC_X
.endm
#else
// KERNEL_F1 ( real, single ): fold one element of a contiguous vector into
// the running ( scale, ssq ) pair.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_F1
// load x and advance X past it ( writeback )
fldmias X!, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
// the two conditional instructions and the branch below all test the flags
// of the same compare
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
.endm
// KERNEL_F8 ( real, single ): one prefetch of the data X_PRE bytes ahead of
// X, followed by eight consecutive KERNEL_F1 expansions.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( real, single ): fold one element of a strided vector into the
// running ( scale, ssq ) pair, then step X by INC_X ( already in bytes ).
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
// The local label carries the per-expansion counter suffix, as in the other
// kernel macros, so the macro can be expanded more than once.
.macro KERNEL_S1
// load x without writeback; the stride is applied at the end
fldmias X, { s4 }
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
// a zero element changes neither scale nor ssq
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// x exceeds the current scale: rescale ssq and adopt x as the new scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT_\@:
add X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
// KERNEL_F1 ( complex, double ): fold one complex element of a contiguous
// vector into the running ( scale, ssq ) pair. The two loaded components
// ( d4, then d5 ) are treated as two independent real values.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_F1
// load both components and advance X past them ( writeback )
fldmiad X!, { d4 - d5 }
// ---- first component ( d4 ) ----
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_F1_NEXT_\@:
// ---- second component ( d5 ), same steps ----
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_F1_END_\@:
.endm
// KERNEL_F8 ( complex, double ): eight consecutive KERNEL_F1 expansions, with
// a prefetch of the data X_PRE bytes ahead of X before each pair.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( complex, double ): fold one complex element of a strided vector
// into the running ( scale, ssq ) pair, then step X by INC_X ( already in
// bytes ). The two loaded components ( d4, then d5 ) are treated as two
// independent real values.
// Registers: d0 = scale, d1 = ssq, d6 = 0.0, d7 = 1.0, d2 / d3 = scratch.
.macro KERNEL_S1
// load both components without writeback; the stride is applied at the end
fldmiad X, { d4 - d5 }
// ---- first component ( d4 ) ----
vcmpe.f64 d4, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f64 d4, d4
vcmpe.f64 d0, d4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d4, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f64 d2 , d0, d4 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d4 // scale = x
KERNEL_S1_NEXT_\@:
// ---- second component ( d5 ), same steps ----
vcmpe.f64 d5, d6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f64 d5, d5
vcmpe.f64 d0, d5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f64 d2 , d5, d0 // scale >= x ? x / scale
vmlage.f64 d1 , d2 , d2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f64 d2 , d0, d5 // scale / x
vmul.f64 d2 , d2, d2 // ( scale / x ) * ( scale / x )
vmul.f64 d3 , d1, d2 // ssq * ( scale / x ) * ( scale / x )
vadd.f64 d1 , d3, d7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f64 d0 , d5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X
.endm
#else
// KERNEL_F1 ( complex, single ): fold one complex element of a contiguous
// vector into the running ( scale, ssq ) pair. The two loaded components
// ( s4, then s5 ) are treated as two independent real values.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_F1
// load both components and advance X past them ( writeback )
fldmias X!, { s4 - s5 }
// ---- first component ( s4 ) ----
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_F1_NEXT_\@:
// ---- second component ( s5 ), same steps ----
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_F1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_F1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_F1_END_\@:
.endm
// KERNEL_F8 ( complex, single ): eight consecutive KERNEL_F1 expansions, with
// a prefetch of the data X_PRE bytes ahead of X before each group of four.
.macro KERNEL_F8
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
pld [ X, #X_PRE ]
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
.endm
// KERNEL_S1 ( complex, single ): fold one complex element of a strided vector
// into the running ( scale, ssq ) pair, then step X by INC_X ( already in
// bytes ). The two loaded components ( s4, then s5 ) are treated as two
// independent real values.
// Registers: s0 = scale, s1 = ssq, s6 = 0.0, s7 = 1.0, s2 / s3 = scratch.
.macro KERNEL_S1
// load both components without writeback; the stride is applied at the end
fldmias X, { s4 - s5 }
// ---- first component ( s4 ) ----
vcmpe.f32 s4, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_NEXT_\@
vabs.f32 s4, s4
vcmpe.f32 s0, s4 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s4, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_NEXT_\@
// component exceeds the current scale: rescale ssq, adopt it as scale
vdiv.f32 s2 , s0, s4 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s4 // scale = x
KERNEL_S1_NEXT_\@:
// ---- second component ( s5 ), same steps ----
vcmpe.f32 s5, s6 // compare with 0.0
vmrs APSR_nzcv, fpscr
beq KERNEL_S1_END_\@
vabs.f32 s5, s5
vcmpe.f32 s0, s5 // compare with scale
vmrs APSR_nzcv, fpscr
vdivge.f32 s2 , s5, s0 // scale >= x ? x / scale
vmlage.f32 s1 , s2 , s2 // ssq += ( x/scale ) * ( x/scale )
bge KERNEL_S1_END_\@
vdiv.f32 s2 , s0, s5 // scale / x
vmul.f32 s2 , s2, s2 // ( scale / x ) * ( scale / x )
vmul.f32 s3 , s1, s2 // ssq * ( scale / x ) * ( scale / x )
vadd.f32 s1 , s3, s7 // ssq = 1 + ssq * ( scale / x ) * ( scale / x )
vmov.f32 s0 , s5 // scale = x
KERNEL_S1_END_\@:
add X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// Function body of the nrm2 kernel ( VFPv3 variant ).
// Inputs: N ( r0 ) element count, X ( r1 ) vector pointer, INC_X ( r2 ) stride
// in elements. The result scale * sqrt( ssq ) is left in d0 ( s0 for single
// precision ) before returning through lr.
PROLOGUE
.align 5
// initialise the accumulators ( scale, ssq ) and keep copies of the
// constants 0.0 and 1.0 that the KERNEL macros compare against and add;
// 1.0 is materialised with an immediate vmov
#if defined(DOUBLE)
vsub.f64 d0 , d0 , d0 // scale=0.0
vmov.f64 d1 , #1.0 // ssq=1.0
vmov.f64 d7 , d1 // value 1.0
vmov.f64 d6 , d0 // value 0.0
#else
vsub.f32 s0 , s0 , s0 // scale=0.0
vmov.f32 s1 , #1.0 // ssq=1.0
vmov.f32 s7 , s1 // value 1.0
vmov.f32 s6 , s0 // value 0.0
#endif
// N <= 0 or INC_X == 0: go straight to the exit, which then evaluates
// 0.0 * sqrt( 1.0 )
cmp N, #0
ble nrm2_kernel_L999
cmp INC_X, #0
beq nrm2_kernel_L999
// any stride other than 1 takes the strided loop
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
// ---- contiguous path: blocks of eight elements, then the remainder ----
nrm2_kernel_F_BEGIN:
asrs I, N, #3 // I = N / 8
ble nrm2_kernel_F1
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F1:
// I = N mod 8 leftover elements
ands I, N, #7
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
// ---- strided path: convert INC_X from elements to bytes, one element per
// iteration ----
nrm2_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
nrm2_kernel_S1:
mov I, N
.align 5
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
// ---- exit: result = scale * sqrt( ssq ) ----
nrm2_kernel_L999:
#if defined(DOUBLE)
vsqrt.f64 d1, d1
vmul.f64 d0, d0, d1
#else
vsqrt.f32 s1, s1
vmul.f32 s0, s0, s1
#endif
bx lr
EPILOGUE

62
kernel/arm/rot.c Normal file
View File

@ -0,0 +1,62 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Apply a plane rotation to n element pairs taken from x and y:
 *
 *     x[k] <- c * x[k] + s * y[k]
 *     y[k] <- c * y[k] - s * x[k]
 *
 * where both right-hand sides use the values from before the update.
 *
 *   n             number of pairs to rotate; n <= 0 does nothing
 *   x, inc_x      first vector and its stride in units of FLOAT
 *   y, inc_y      second vector and its stride in units of FLOAT
 *   c, s          rotation coefficients
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;
	FLOAT rotated_x;

	if ( n <= 0 ) return(0);

	for (k = 0; k < n; k++)
	{
		/* hold the new x until y has been computed from the old x */
		rotated_x = c*x[xpos] + s*y[ypos] ;
		y[ypos] = c*y[ypos] - s*x[xpos] ;
		x[xpos] = rotated_x ;

		xpos += inc_x ;
		ypos += inc_y ;
	}
	return(0);
}

584
kernel/arm/rot_vfp.S Normal file
View File

@ -0,0 +1,584 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/15 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
#define OLD_INC_Y [fp, #0 ]
#define N r0
#define X r1
#define INC_X r2
#define Y r3
#define INC_Y r4
#define I r12
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*****************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
// KERNEL_F4 ( real, double ): rotate four consecutive ( x, y ) pairs in place.
// Each step computes  x <- d0 * x + d1 * y  and  y <- d0 * y - d1 * x  from
// the values just loaded, stores both results and advances X and Y by one
// element.
// NOTE(review): d0 / d1 are presumably the rotation coefficients c / s set up
// by the caller of this macro ( compare kernel/arm/rot.c ) - confirm.
.macro KERNEL_F4
// prefetch both streams X_PRE bytes ahead
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
// ---- pair 1 ----
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// ---- pair 2 ----
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// ---- pair 3 ----
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
// ---- pair 4 ----
fldmiad X, { d4 }
fldmiad Y, { d5 }
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
.endm
// KERNEL_F1 ( real, double ): rotate one ( x, y ) pair of contiguous vectors
// in place and advance X and Y by one element.
// NOTE(review): d0 / d1 are presumably the rotation coefficients c / s set up
// by the caller of this macro - confirm.
.macro KERNEL_F1
// load the pair without writeback; the stores below advance the pointers
fldmiad X, { d4 }
fldmiad Y, { d5 }
// d2 = d0 * x + d1 * y
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
// d3 = d0 * y - d1 * x
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
fstmiad X!, { d2 }
fstmiad Y!, { d3 }
.endm
// KERNEL_S1 ( real, double ): rotate one ( x, y ) pair of strided vectors in
// place, then step X by INC_X and Y by INC_Y.
// NOTE(review): d0 / d1 are presumably the rotation coefficients c / s, and
// INC_X / INC_Y are presumably already scaled to bytes by the code that
// expands this macro - confirm.
.macro KERNEL_S1
fldmiad X, { d4 }
fldmiad Y, { d5 }
// d2 = d0 * x + d1 * y
vmul.f64 d2 , d0, d4
fmacd d2 , d1, d5
// d3 = d0 * y - d1 * x
vmul.f64 d3 , d0, d5
fnmacd d3 , d1, d4
// store without writeback; the strides are applied explicitly below
fstmiad X, { d2 }
fstmiad Y, { d3 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#else
/*
 * Real, single precision.  With x loaded into s4 and y into s5, every
 * kernel below performs
 *     x <- s0*x + s1*y
 *     y <- s0*y - s1*x      (using the x value from before the update)
 * s0 / s1 are read-only here; presumably the caller-supplied rotation
 * coefficients -- confirm against the C prototype.
 */

/* Contiguous step: one element; the stores post-increment X and Y. */
.macro KERNEL_F1
	vldmia.f32	X, { s4 }
	vldmia.f32	Y, { s5 }
	vmul.f32	s2, s0, s4
	vmla.f32	s2, s1, s5
	vmul.f32	s3, s0, s5
	vmls.f32	s3, s1, s4
	vstmia.f32	X!, { s2 }
	vstmia.f32	Y!, { s3 }
.endm

/* Contiguous step: four elements (this variant issues no prefetch itself). */
.macro KERNEL_F4
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one element; X and Y move by the byte strides INC_X / INC_Y. */
.macro KERNEL_S1
	vldmia.f32	X, { s4 }
	vldmia.f32	Y, { s5 }
	vmul.f32	s2, s0, s4
	vmla.f32	s2, s1, s5
	vmul.f32	s3, s0, s5
	vmls.f32	s3, s1, s4
	vstmia.f32	X, { s2 }
	vstmia.f32	Y, { s3 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm
#endif
#else
#if defined(DOUBLE)
/*
 * Complex, double precision.  One element is two consecutive doubles.
 * X supplies d4,d5 and Y supplies d6,d7; each component pair (d4,d6) and
 * (d5,d7) is updated independently as
 *     x <- d0*x + d1*y
 *     y <- d0*y - d1*x      (using the x value from before the update)
 * d0 / d1 are read-only scalars here.
 */

/* Contiguous step: one complex element; the stores post-increment X and Y. */
.macro KERNEL_F1
	vldmia.f64	X, { d4 - d5 }
	vldmia.f64	Y, { d6 - d7 }
	/* first component */
	vmul.f64	d2, d0, d4
	vmla.f64	d2, d1, d6
	vmul.f64	d3, d0, d6
	vmls.f64	d3, d1, d4
	vstmia.f64	X!, { d2 }
	vstmia.f64	Y!, { d3 }
	/* second component */
	vmul.f64	d2, d0, d5
	vmla.f64	d2, d1, d7
	vmul.f64	d3, d0, d7
	vmls.f64	d3, d1, d5
	vstmia.f64	X!, { d2 }
	vstmia.f64	Y!, { d3 }
.endm

/* Contiguous step: four complex elements, prefetching before each pair. */
.macro KERNEL_F4
	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one complex element written back in place at offsets 0 and
 * 8, then X and Y move by the byte strides INC_X / INC_Y. */
.macro KERNEL_S1
	vldmia.f64	X, { d4 - d5 }
	vldmia.f64	Y, { d6 - d7 }
	vmul.f64	d2, d0, d4
	vmla.f64	d2, d1, d6
	vmul.f64	d3, d0, d6
	vmls.f64	d3, d1, d4
	vstr	d2, [ X, #0 ]
	vstr	d3, [ Y, #0 ]
	vmul.f64	d2, d0, d5
	vmla.f64	d2, d1, d7
	vmul.f64	d3, d0, d7
	vmls.f64	d3, d1, d5
	vstr	d2, [ X, #8 ]
	vstr	d3, [ Y, #8 ]
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm
#else
/*
 * Complex, single precision.  One element is two consecutive floats.
 * X supplies s4,s5 and Y supplies s6,s7; each component pair (s4,s6) and
 * (s5,s7) is updated independently as
 *     x <- s0*x + s1*y
 *     y <- s0*y - s1*x      (using the x value from before the update)
 * s0 / s1 are read-only scalars here.
 */

/* Contiguous step: one complex element; the stores post-increment X and Y. */
.macro KERNEL_F1
	vldmia.f32	X, { s4 - s5 }
	vldmia.f32	Y, { s6 - s7 }
	/* first component */
	vmul.f32	s2, s0, s4
	vmla.f32	s2, s1, s6
	vmul.f32	s3, s0, s6
	vmls.f32	s3, s1, s4
	vstmia.f32	X!, { s2 }
	vstmia.f32	Y!, { s3 }
	/* second component */
	vmul.f32	s2, s0, s5
	vmla.f32	s2, s1, s7
	vmul.f32	s3, s0, s7
	vmls.f32	s3, s1, s5
	vstmia.f32	X!, { s2 }
	vstmia.f32	Y!, { s3 }
.endm

/* Contiguous step: four complex elements, prefetching before each pair. */
.macro KERNEL_F4
	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
	pld	[ X, #X_PRE ]
	pld	[ Y, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one complex element written back in place at offsets 0 and
 * 4, then X and Y move by the byte strides INC_X / INC_Y. */
.macro KERNEL_S1
	vldmia.f32	X, { s4 - s5 }
	vldmia.f32	Y, { s6 - s7 }
	vmul.f32	s2, s0, s4
	vmla.f32	s2, s1, s6
	vmul.f32	s3, s0, s6
	vmls.f32	s3, s1, s4
	vstr	s2, [ X, #0 ]
	vstr	s3, [ Y, #0 ]
	vmul.f32	s2, s0, s5
	vmla.f32	s2, s1, s7
	vmul.f32	s3, s0, s7
	vmls.f32	s3, s1, s5
	vstr	s2, [ X, #4 ]
	vstr	s3, [ Y, #4 ]
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// rot kernel entry point.  Applies the KERNEL_* macros selected above to N
// element pairs of X and Y, in place.  Always returns 0 in r0.
PROLOGUE
.align 5
// Save r4 (it becomes INC_Y) and fp.  fp = sp + 8 then addresses the first
// word above the two saved registers, which is where INC_Y is read from.
push {r4 , fp}
add fp, sp, #8
ldr INC_Y , OLD_INC_Y
// Nothing to do for N <= 0 or when either increment is zero.
cmp N, #0
ble rot_kernel_L999
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
// Only the case INC_X == INC_Y == 1 uses the contiguous (F) path.
cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
// ---- contiguous path -------------------------------------------------
rot_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
// No complete group of four: go straight to the remainder loop.
ble rot_kernel_F1
.align 5
// Loop body is unrolled twice; each KERNEL_F4 handles four elements and
// decrements I once, with an exit check between the two copies.
rot_kernel_F4:
// The real single-precision KERNEL_F4 contains no pld, so prefetch here.
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble rot_kernel_F1
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
// Remainder: N & 3 elements, one at a time.
rot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
rot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
b rot_kernel_L999
// ---- strided path ----------------------------------------------------
// Turn the element increments into byte strides; the shift amount depends
// on the element size selected by COMPLEX / DOUBLE.
rot_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble rot_kernel_S1
.align 5
// Four strided elements per iteration.
rot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
// Remainder: N & 3 strided elements.
rot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
rot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
// Common exit: r0 = 0, unwind to the two saved registers and return.
rot_kernel_L999:
mov r0, #0 // set return value
sub sp, fp, #8
pop {r4,fp}
bx lr
EPILOGUE

58
kernel/arm/scal.c Normal file
View File

@ -0,0 +1,58 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Generic scal kernel: multiplies, in place, the n elements of x that are
 * inc_x apart by the scalar da.
 *
 * Returns 0 in every case.  Nothing is written when n is negative, when
 * inc_x is less than 1, or when da compares equal to 1.0.  The remaining
 * parameters (dummy0, dummy1, y, inc_y, dummy, dummy2) are not used.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG pos;
	BLASLONG end;

	if ( n < 0 || inc_x < 1 ) return(0);
	if ( da == 1.0 ) return(0);

	/* One past the last index touched: n elements, inc_x apart. */
	end = n * inc_x;

	for ( pos = 0; pos < end; pos += inc_x )
	{
		x[pos] = da * x[pos] ;
	}

	return(0);
}

376
kernel/arm/scal_vfp.S Normal file
View File

@ -0,0 +1,376 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/15 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases for the scal kernel that follows. */

/* Not referenced by any instruction in this kernel. */
#define STACKSIZE 256
/* Stack word INC_X is loaded from; sp is still the entry sp at that point. */
#define OLD_INC_X [sp, #0 ]
/* Element count; the kernel exits at once when it is <= 0. */
#define N r0
/* Increment of X in elements (loaded from OLD_INC_X); becomes a byte stride. */
#define INC_X r1
/* Pointer into the vector being scaled; advanced as elements are processed. */
#define X r3
/* Loop counter. */
#define I r12
/* Byte offset ahead of X used by the pld prefetch hints. */
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*****************************************************************************************/
#if !defined(COMPLEX)
#if defined(DOUBLE)
/* Real, double precision: each element is multiplied in place by d0. */

/* Contiguous step: prefetch, load four doubles, scale them and store them
 * back as two pairs; both stores post-increment X. */
.macro KERNEL_F4
	pld	[ X, #X_PRE ]
	vldmia.f64	X, { d4 - d7 }
	vmul.f64	d4, d4, d0
	vmul.f64	d5, d5, d0
	vmul.f64	d6, d6, d0
	vstmia.f64	X!, { d4 - d5 }
	vmul.f64	d7, d7, d0
	vstmia.f64	X!, { d6 - d7 }
.endm

/* Contiguous step: one element; the store post-increments X. */
.macro KERNEL_F1
	vldmia.f64	X, { d4 }
	vmul.f64	d4, d4, d0
	vstmia.f64	X!, { d4 }
.endm

/* Strided step: one element, then X moves by the byte stride INC_X. */
.macro KERNEL_S1
	vldmia.f64	X, { d4 }
	vmul.f64	d4, d4, d0
	vstmia.f64	X, { d4 }
	add	X, X, INC_X
.endm
#else
/* Real, single precision: each element is multiplied in place by s0. */

/* Contiguous step: load four floats, scale them and store them back as two
 * pairs; both stores post-increment X.  No prefetch is issued here. */
.macro KERNEL_F4
	vldmia.f32	X, { s4 - s7 }
	vmul.f32	s4, s4, s0
	vmul.f32	s5, s5, s0
	vmul.f32	s6, s6, s0
	vstmia.f32	X!, { s4 - s5 }
	vmul.f32	s7, s7, s0
	vstmia.f32	X!, { s6 - s7 }
.endm

/* Contiguous step: one element; the store post-increments X. */
.macro KERNEL_F1
	vldmia.f32	X, { s4 }
	vmul.f32	s4, s4, s0
	vstmia.f32	X!, { s4 }
.endm

/* Strided step: one element, then X moves by the byte stride INC_X. */
.macro KERNEL_S1
	vldmia.f32	X, { s4 }
	vmul.f32	s4, s4, s0
	vstmia.f32	X, { s4 }
	add	X, X, INC_X
.endm
#endif
#else
#if defined(DOUBLE)
/*
 * Complex, double precision.  An element is the pair (d4, d5) loaded from
 * two consecutive doubles; it is replaced in place by
 *     d2 = d0*d4 - d1*d5
 *     d3 = d0*d5 + d1*d4
 * i.e. the product of (d4 + i*d5) with (d0 + i*d1).
 */

/* Contiguous step: one complex element; the store post-increments X. */
.macro KERNEL_F1
	vldmia.f64	X, { d4 - d5 }
	vmul.f64	d2, d0, d4
	vmls.f64	d2, d1, d5
	vmul.f64	d3, d0, d5
	vmla.f64	d3, d1, d4
	vstmia.f64	X!, { d2 - d3 }
.endm

/* Contiguous step: four complex elements, prefetching before each pair. */
.macro KERNEL_F4
	pld	[ X, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
	pld	[ X, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one complex element, then X moves by the byte stride INC_X. */
.macro KERNEL_S1
	vldmia.f64	X, { d4 - d5 }
	vmul.f64	d2, d0, d4
	vmls.f64	d2, d1, d5
	vmul.f64	d3, d0, d5
	vmla.f64	d3, d1, d4
	vstmia.f64	X, { d2 - d3 }
	add	X, X, INC_X
.endm
#else
/*
 * Complex, single precision.  An element is the pair (s4, s5) loaded from
 * two consecutive floats; it is replaced in place by
 *     s2 = s0*s4 - s1*s5
 *     s3 = s0*s5 + s1*s4
 * i.e. the product of (s4 + i*s5) with (s0 + i*s1).
 */

/* Contiguous step: one complex element; the store post-increments X. */
.macro KERNEL_F1
	vldmia.f32	X, { s4 - s5 }
	vmul.f32	s2, s0, s4
	vmls.f32	s2, s1, s5
	vmul.f32	s3, s0, s5
	vmla.f32	s3, s1, s4
	vstmia.f32	X!, { s2 - s3 }
.endm

/* Contiguous step: one prefetch, then four complex elements. */
.macro KERNEL_F4
	pld	[ X, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one complex element, then X moves by the byte stride INC_X. */
.macro KERNEL_S1
	vldmia.f32	X, { s4 - s5 }
	vmul.f32	s2, s0, s4
	vmls.f32	s2, s1, s5
	vmul.f32	s3, s0, s5
	vmla.f32	s3, s1, s4
	vstmia.f32	X, { s2 - s3 }
	add	X, X, INC_X
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// scal kernel entry point.  Applies the KERNEL_* macros selected above to N
// elements of X, in place.  Always returns 0 in r0.  Nothing is pushed, so
// sp is unchanged throughout.
PROLOGUE
.align 5
ldr INC_X , OLD_INC_X
// Nothing to do for N <= 0 or INC_X <= 0.
cmp N, #0
ble scal_kernel_L999
cmp INC_X, #0
ble scal_kernel_L999
// INC_X == 1 uses the contiguous (F) path, anything else the strided path.
cmp INC_X, #1
bne scal_kernel_S_BEGIN
// ---- contiguous path -------------------------------------------------
scal_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
// No complete group of four: go straight to the remainder loop.
ble scal_kernel_F1
.align 5
// Loop body is unrolled twice; each KERNEL_F4 handles four elements and
// decrements I once, with an exit check between the two copies.
scal_kernel_F4:
// The real single-precision KERNEL_F4 contains no pld, so prefetch here.
#if !defined(COMPLEX) && !defined(DOUBLE)
pld [ X, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble scal_kernel_F1
KERNEL_F4
subs I, I, #1
bne scal_kernel_F4
// Remainder: N & 3 elements, one at a time.
scal_kernel_F1:
ands I, N, #3
ble scal_kernel_L999
scal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne scal_kernel_F10
b scal_kernel_L999
// ---- strided path ----------------------------------------------------
// Turn the element increment into a byte stride; the shift amount depends
// on the element size selected by COMPLEX / DOUBLE.
scal_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble scal_kernel_S1
.align 5
// Four strided elements per iteration.
scal_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne scal_kernel_S4
// Remainder: N & 3 strided elements.
scal_kernel_S1:
ands I, N, #3
ble scal_kernel_L999
scal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne scal_kernel_S10
// Common exit.
scal_kernel_L999:
mov r0, #0 // set return value
bx lr
EPILOGUE

224
kernel/arm/scopy_vfp.S Normal file
View File

@ -0,0 +1,224 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases for the scopy kernel that follows. */

/* Bytes subtracted from sp by the prologue for the local save area. */
#define STACKSIZE 256
/* Element count; the kernel exits at once when it is <= 0. */
#define N r0
/* Source pointer; advanced as elements are copied. */
#define X r1
/* Increment of X in elements; shifted into a byte stride on the strided path. */
#define INC_X r2
/* Destination pointer as it arrives; the prologue copies it into Y. */
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Stack word INC_Y is loaded from, once the prologue has set fp = sp + 24
 * after pushing seven registers. */
#define OLD_INC_Y [fp, #4 ]
/* Loop counter. */
#define I r5
/* Destination pointer; advanced as elements are copied. */
#define Y r6
/* Increment of Y in elements; shifted into a byte stride on the strided path. */
#define INC_Y r7
/* Byte offset ahead of X used by the pld prefetch hint. */
#define X_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* Contiguous copy of eight floats: prefetch the source, load two groups of
 * four into s0-s7, then store them; every transfer post-increments its
 * pointer. */
.macro COPY_F8
	pld	[ X, #X_PRE ]
	vldmia.f32	X!, { s0 - s3 }
	vldmia.f32	X!, { s4 - s7 }
	vstmia.f32	Y!, { s0 - s3 }
	vstmia.f32	Y!, { s4 - s7 }
.endm

/* Contiguous copy of a single float through s0; X and Y post-increment. */
.macro COPY_F1
	vldmia.f32	X!, { s0 }
	vstmia.f32	Y!, { s0 }
.endm
/*************************************************************************************************************************/
/* Strided copy of four floats.  Elements alternate between s0 and s1 as the
 * staging register; after each one X and Y move by the byte strides
 * INC_X / INC_Y.  The leading nop is part of the original sequence. */
.macro COPY_S4
	nop

	vldmia.f32	X, { s0 }
	vstmia.f32	Y, { s0 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

	vldmia.f32	X, { s1 }
	vstmia.f32	Y, { s1 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

	vldmia.f32	X, { s0 }
	vstmia.f32	Y, { s0 }
	add	X, X, INC_X
	add	Y, Y, INC_Y

	vldmia.f32	X, { s1 }
	vstmia.f32	Y, { s1 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm

/* Strided copy of one float through s0, followed by the stride updates. */
.macro COPY_S1
	vldmia.f32	X, { s0 }
	vstmia.f32	Y, { s0 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// scopy kernel entry point.  Copies N floats from X to Y using the COPY_*
// macros above.  Always returns 0 in r0.
PROLOGUE
.align 5
// Seven registers are pushed (28 bytes); fp = sp + 24 addresses the saved
// fp word, so the first word above the pushed block is [fp, #4].
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
// Spill s8-s15 to the eight words starting at fp - 128.
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
// Nothing to do for N <= 0 or when either increment is zero.
cmp N, #0
ble scopy_kernel_L999
cmp INC_X, #0
beq scopy_kernel_L999
cmp INC_Y, #0
beq scopy_kernel_L999
// Only the case INC_X == INC_Y == 1 uses the contiguous (F) path.
cmp INC_X, #1
bne scopy_kernel_S_BEGIN
cmp INC_Y, #1
bne scopy_kernel_S_BEGIN
// ---- contiguous path: groups of eight, then the N & 7 remainder --------
scopy_kernel_F_BEGIN:
asrs I, N, #3 // I = N / 8
ble scopy_kernel_F1
scopy_kernel_F8:
COPY_F8
subs I, I, #1
bne scopy_kernel_F8
scopy_kernel_F1:
ands I, N, #7
ble scopy_kernel_L999
scopy_kernel_F10:
COPY_F1
subs I, I, #1
bne scopy_kernel_F10
b scopy_kernel_L999
// ---- strided path: groups of four, then the N & 3 remainder ------------
scopy_kernel_S_BEGIN:
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
asrs I, N, #2 // I = N / 4
ble scopy_kernel_S1
scopy_kernel_S4:
COPY_S4
subs I, I, #1
bne scopy_kernel_S4
scopy_kernel_S1:
ands I, N, #3
ble scopy_kernel_L999
scopy_kernel_S10:
COPY_S1
subs I, I, #1
bne scopy_kernel_S10
// Common exit: reload s8-s15, r0 = 0, unwind to the pushed block, return.
scopy_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

347
kernel/arm/sdot_vfp.S Normal file
View File

@ -0,0 +1,347 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK (no test for dsdot)
* TEST : OK (no test for dsdot)
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases for the sdot / dsdot kernel that follows. */

/* Bytes subtracted from sp by the prologue for the local save area. */
#define STACKSIZE 256
/* Element count; the kernel returns a zero sum when it is <= 0. */
#define N r0
/* Pointer into the first vector; advanced as elements are consumed. */
#define X r1
/* Increment of X in elements; shifted into a byte stride on the strided path. */
#define INC_X r2
/* Second vector pointer as it arrives; the prologue copies it into Y. */
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Stack word INC_Y is loaded from, once the prologue has set fp = sp + 24
 * after pushing seven registers. */
#define OLD_INC_Y [fp, #4 ]
/* Loop counter. */
#define I r5
/* Pointer into the second vector; advanced as elements are consumed. */
#define Y r6
/* Increment of Y in elements; shifted into a byte stride on the strided path. */
#define INC_Y r7
/* Not referenced by any instruction in this kernel (no pld is issued). */
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
#if defined(DSDOT)
/*
 * DSDOT kernels: single-precision inputs, double-precision accumulation.
 *
 * Each element is loaded as a float into s14 / s15, widened to double in
 * d4 / d5, and only then multiplied and added into the accumulator d0.
 * The product of two widened floats is exactly representable as a double,
 * so the only rounding left is the one in the accumulation.  (Multiplying
 * in single precision first and widening the rounded product would discard
 * the low bits before they ever reach the double-precision sum.)
 *
 * Scratch registers: s14, s15 (= d7), d4 (= s8/s9), d5 (= s10/s11).  All of
 * them lie in the s8-s15 range that the entry code of this file spills and
 * reloads.
 */

/* Contiguous step: one element; the loads post-increment X and Y. */
.macro KERNEL_F1
	vldmia.f32	X!, { s14 }
	vldmia.f32	Y!, { s15 }
	vcvt.f64.f32	d4, s14
	vcvt.f64.f32	d5, s15
	vmla.f64	d0, d4, d5
.endm

/* Contiguous step: four elements. */
.macro KERNEL_F4
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
	KERNEL_F1
.endm

/* Strided step: one element, then X and Y move by the byte strides
 * INC_X / INC_Y. */
.macro KERNEL_S1
	vldmia.f32	X, { s14 }
	vldmia.f32	Y, { s15 }
	vcvt.f64.f32	d4, s14
	vcvt.f64.f32	d5, s15
	vmla.f64	d0, d4, d5
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm

/* Strided step: four elements.  The leading nop is kept from the original
 * sequence. */
.macro KERNEL_S4
	nop
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
.endm
#else
/*
 * SDOT kernels: single-precision multiply-accumulate.  Partial sums are
 * kept in s0 and s1; the F4 / S4 forms spread their four products across
 * both, the one-element forms add to s0 only.
 */

/* Contiguous step: four elements.  X supplies s8-s11, Y supplies s4-s7;
 * loads and multiply-accumulates are interleaved as in the original. */
.macro KERNEL_F4
	vldmia.f32	X!, { s8 - s9 }
	vldmia.f32	Y!, { s4 - s5 }
	vmla.f32	s0, s4, s8
	vldmia.f32	X!, { s10 - s11 }
	vmla.f32	s1, s5, s9
	vldmia.f32	Y!, { s6 - s7 }
	vmla.f32	s0, s6, s10
	vmla.f32	s1, s7, s11
.endm

/* Contiguous step: one element into s0; the loads post-increment X and Y. */
.macro KERNEL_F1
	vldmia.f32	X!, { s4 }
	vldmia.f32	Y!, { s8 }
	vmla.f32	s0, s4, s8
.endm

/* Strided step: four elements, alternating between s0 and s1; X and Y move
 * by the byte strides INC_X / INC_Y after each pair of loads.  The leading
 * nop is part of the original sequence. */
.macro KERNEL_S4
	nop

	vldmia.f32	X, { s4 }
	vldmia.f32	Y, { s8 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
	vmla.f32	s0, s4, s8

	vldmia.f32	X, { s5 }
	vldmia.f32	Y, { s9 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
	vmla.f32	s1, s5, s9

	vldmia.f32	X, { s6 }
	vldmia.f32	Y, { s10 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
	vmla.f32	s0, s6, s10

	vldmia.f32	X, { s7 }
	vldmia.f32	Y, { s11 }
	add	X, X, INC_X
	add	Y, Y, INC_Y
	vmla.f32	s1, s7, s11
.endm

/* Strided step: one element into s0. */
.macro KERNEL_S1
	vldmia.f32	X, { s4 }
	vldmia.f32	Y, { s8 }
	add	X, X, INC_X
	vmla.f32	s0, s4, s8
	add	Y, Y, INC_Y
.endm
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
// sdot / dsdot kernel entry point.  Sums the products of N element pairs of
// X and Y with the KERNEL_* macros above and returns the sum in s0 (d0 when
// DSDOT is defined).  The sum is zero when N <= 0 or an increment is zero.
PROLOGUE
	.align 5

	// Seven registers are pushed (28 bytes); fp = sp + 24 addresses the
	// saved fp word, so the first word above the pushed block is [fp, #4].
	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

	sub	r4, fp, #128
	vstm	r4, { s8 - s15 }		// store floating point registers

	mov	Y, OLD_Y
	ldr	INC_Y, OLD_INC_Y

	// Clear the accumulators by writing the all-zero bit pattern (+0.0 for
	// both float and double) from a core register.  Subtracting a register
	// from itself is not a safe way to do this: the accumulators hold
	// whatever the caller left in them, and x - x is NaN when x is NaN or
	// an infinity, which would poison the whole sum.  r4 is free again
	// once the vstm above has used it as a base address.
	mov	r4, #0
	vmov	s0, r4
	vmov	s1, r4
#if defined(DSDOT)
	vmov	s2, r4				// d0 = s1:s0, d1 = s3:s2
	vmov	s3, r4
#endif

	// Nothing to accumulate for N <= 0 or when either increment is zero.
	cmp	N, #0
	ble	sdot_kernel_L999

	cmp	INC_X, #0
	beq	sdot_kernel_L999

	cmp	INC_Y, #0
	beq	sdot_kernel_L999

	// Only the case INC_X == INC_Y == 1 uses the contiguous (F) path.
	cmp	INC_X, #1
	bne	sdot_kernel_S_BEGIN

	cmp	INC_Y, #1
	bne	sdot_kernel_S_BEGIN

	// ---- contiguous path: groups of four, then the N & 3 remainder ----
sdot_kernel_F_BEGIN:
	asrs	I, N, #2			// I = N / 4
	ble	sdot_kernel_F1

sdot_kernel_F4:
	KERNEL_F4
	subs	I, I, #1
	bne	sdot_kernel_F4

sdot_kernel_F1:
	ands	I, N, #3
	ble	sdot_kernel_L999

sdot_kernel_F10:
	KERNEL_F1
	subs	I, I, #1
	bne	sdot_kernel_F10

	b	sdot_kernel_L999

	// ---- strided path: groups of four, then the N & 3 remainder -------
sdot_kernel_S_BEGIN:
	lsl	INC_X, INC_X, #2		// INC_X * SIZE
	lsl	INC_Y, INC_Y, #2		// INC_Y * SIZE

	asrs	I, N, #2			// I = N / 4
	ble	sdot_kernel_S1

sdot_kernel_S4:
	KERNEL_S4
	subs	I, I, #1
	bne	sdot_kernel_S4

sdot_kernel_S1:
	ands	I, N, #3
	ble	sdot_kernel_L999

sdot_kernel_S10:
	KERNEL_S1
	subs	I, I, #1
	bne	sdot_kernel_S10

	// Common exit: reload s8-s15, fold the two partial sums into the
	// return register, unwind to the pushed block and return.
sdot_kernel_L999:
	sub	r3, fp, #128
	vldm	r3, { s8 - s15 }		// restore floating point registers

#if defined(DSDOT)
	vadd.f64	d0, d0, d1		// set return value
#else
	vadd.f32	s0, s0, s1		// set return value
#endif

	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr
EPILOGUE

View File

@ -0,0 +1,797 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/28 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases for the sgemm 4x2 kernel.  The entry code
 * that fills the stack slots lies further down the file; notes about it
 * here are assumptions to confirm there. */

#define STACKSIZE 256
/* Incoming values, by register.  NOTE(review): presumably copied into the
 * M / N / K / A / ALPHA stack slots by the prologue -- confirm. */
#define OLD_M r0
#define OLD_N r1
#define OLD_K r2
#define OLD_A r3
#define OLD_ALPHA s0
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Local stack slots, addressed below fp.  LDC is added to CO1 as a byte
 * offset and ALPHA is loaded as a single float by the SAVE macros. */
#define LDC [fp, #-252 ]
#define M [fp, #-256 ]
#define N [fp, #-260 ]
#define K [fp, #-264 ]
#define A [fp, #-268 ]
#define ALPHA [fp, #-280]
/* Slots addressed above fp.  NOTE(review): presumably the stack-passed
 * arguments -- confirm against the prologue's push layout. */
#define B [fp, #4 ]
#define C [fp, #8 ]
#define OLD_LDC [fp, #12 ]
/* Core-register aliases; I, J and L reuse r0-r2. */
#define I r0
#define J r1
#define L r2
/* Read pointers into the A and B panels; advanced by the KERNEL macros. */
#define AO r5
#define BO r6
/* Addresses of the two C columns being updated; CO2 = CO1 + LDC. */
#define CO1 r8
#define CO2 r9
#define K1 r7
#define BC r12
/* Prefetch distances; not used by the macros visible in this part of the file. */
#define A_PRE 96
#define B_PRE 96
#define C_PRE 64
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro INIT4x2
// Clear the eight accumulators s8 - s15 of a 4x2 tile.
// The zero is taken from an integer register instead of computing
// s8 - s8: that subtraction yields NaN whenever s8 still holds Inf or
// NaN (stale caller data or a previous tile), which would poison the
// whole tile. Clobbers r3, which this kernel only uses as scratch.
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
vmov.f32 s14, s8
vmov.f32 s15, s8
.endm
.macro KERNEL4x2_SUB
// One k-step of the 4x2 tile: rank-1 update of the accumulators.
// Loads a0..a3 from AO and b0,b1 from BO (both pointers advance),
// then s8..s11 += a0..a3 * b0 and s12..s15 += a0..a3 * b1.
fldmias AO! , { s0 - s3 }
fldmias BO! , { s4 - s5 }
fmacs s8 , s0, s4
fmacs s9 , s1, s4
fmacs s10 , s2, s4
fmacs s11 , s3, s4
fmacs s12 , s0, s5
fmacs s13 , s1, s5
fmacs s14 , s2, s5
fmacs s15 , s3, s5
.endm
.macro SAVE4x2
// Write back a 4x2 tile: C += alpha * accumulators.
// CO1 addresses the first column, CO2 = CO1 + LDC the second.
// Clobbers r3, s0 and s4 - s7; advances CO1 by four floats.
ldr r3 , LDC
add CO2 , CO1, r3
flds s0, ALPHA
// first column: C[0..3] += alpha * s8..s11
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
fmacs s7 , s0 , s11
fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
fsts s6 , [CO1, #8 ]
fsts s7 , [CO1, #12 ]
// second column: C[0..3] += alpha * s12..s15
flds s4 , [CO2]
flds s5 , [CO2, #4 ]
flds s6 , [CO2, #8 ]
flds s7 , [CO2, #12 ]
fmacs s4 , s0 , s12
fmacs s5 , s0 , s13
fmacs s6 , s0 , s14
fmacs s7 , s0 , s15
fsts s4 , [CO2]
fsts s5 , [CO2, #4 ]
fsts s6 , [CO2, #8 ]
fsts s7 , [CO2, #12 ]
add CO1, CO1, #16
.endm
/******************************************************************************/
.macro INIT2x2
// Clear the four accumulators s8, s9, s12, s13 of a 2x2 tile.
// The zero comes from an integer register: computing s8 - s8 would give
// NaN if s8 still held Inf or NaN. Clobbers r3 (kernel scratch register).
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
vmov.f32 s9, s8
vmov.f32 s12, s8
vmov.f32 s13, s8
.endm
.macro KERNEL2x2_SUB
// One k-step of the 2x2 tile: loads b0,b1 from BO and a0,a1 from AO,
// then s8,s9 += a0,a1 * b0 and s12,s13 += a0,a1 * b1.
// Both pointers advance by two floats.
flds s4 , [ BO ]
flds s5 , [ BO, #4 ]
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s1, s4
fmacs s12 , s0, s5
fmacs s13 , s1, s5
add AO , AO, #8
add BO , BO, #8
.endm
.macro SAVE2x2
// Write back a 2x2 tile: C += alpha * accumulators for the columns at
// CO1 and CO2 = CO1 + LDC. Clobbers r3, s0, s4, s5; advances CO1 by
// two floats.
ldr r3 , LDC
add CO2 , CO1, r3
flds s0, ALPHA
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
flds s4 , [CO2]
flds s5 , [CO2, #4 ]
fmacs s4 , s0 , s12
fmacs s5 , s0 , s13
fsts s4 , [CO2]
fsts s5 , [CO2, #4 ]
add CO1, CO1, #8
.endm
/******************************************************************************/
.macro INIT1x2
// Clear the two accumulators s8 and s12 of a 1x2 tile.
// The zero comes from an integer register: computing s8 - s8 would give
// NaN if s8 still held Inf or NaN. Clobbers r3 (kernel scratch register).
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
vmov.f32 s12, s8
.endm
.macro KERNEL1x2_SUB
// One k-step of the 1x2 tile: s8 += a0 * b0, s12 += a0 * b1.
// AO advances by one float, BO by two.
flds s4 , [ BO ]
flds s5 , [ BO, #4 ]
flds s0 , [ AO ]
fmacs s8 , s0, s4
fmacs s12 , s0, s5
add AO , AO, #4
add BO , BO, #8
.endm
.macro SAVE1x2
// Write back a 1x2 tile: one element in each of the columns at CO1 and
// CO2 = CO1 + LDC gets alpha * accumulator added.
// Clobbers r3, s0, s4; advances CO1 by one float.
ldr r3 , LDC
add CO2 , CO1, r3
flds s0, ALPHA
flds s4 , [CO1]
fmacs s4 , s0 , s8
fsts s4 , [CO1]
flds s4 , [CO2]
fmacs s4 , s0 , s12
fsts s4 , [CO2]
add CO1, CO1, #4
.endm
/******************************************************************************/
.macro INIT4x1
// Clear the four accumulators s8 - s11 of a 4x1 tile.
// The zero comes from an integer register: computing s8 - s8 would give
// NaN if s8 still held Inf or NaN. Clobbers r3 (kernel scratch register).
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
vmov.f32 s9, s8
vmov.f32 s10, s8
vmov.f32 s11, s8
.endm
.macro KERNEL4x1_SUB
// One k-step of the 4x1 tile: s8..s11 += a0..a3 * b0.
// AO advances by four floats, BO by one.
flds s4 , [ BO ]
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
flds s2 , [ AO, #8 ]
flds s3 , [ AO, #12 ]
fmacs s8 , s0, s4
fmacs s9 , s1, s4
fmacs s10 , s2, s4
fmacs s11 , s3, s4
add AO , AO, #16
add BO , BO, #4
.endm
.macro SAVE4x1
// Write back a 4x1 tile: C[0..3] at CO1 += alpha * s8..s11.
// Clobbers s0 and s4 - s7; advances CO1 by four floats.
flds s0, ALPHA
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
flds s6 , [CO1, #8 ]
flds s7 , [CO1, #12 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fmacs s6 , s0 , s10
fmacs s7 , s0 , s11
fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
fsts s6 , [CO1, #8 ]
fsts s7 , [CO1, #12 ]
add CO1, CO1, #16
.endm
/******************************************************************************/
.macro INIT2x1
// Clear the two accumulators s8 and s9 of a 2x1 tile.
// The zero comes from an integer register: computing s8 - s8 would give
// NaN if s8 still held Inf or NaN. Clobbers r3 (kernel scratch register).
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
vmov.f32 s9 , s8
.endm
.macro KERNEL2x1_SUB
// One k-step of the 2x1 tile: s8,s9 += a0,a1 * b0.
// AO advances by two floats, BO by one.
flds s4 , [ BO ]
flds s0 , [ AO ]
flds s1 , [ AO, #4 ]
fmacs s8 , s0, s4
fmacs s9 , s1, s4
add AO , AO, #8
add BO , BO, #4
.endm
.macro SAVE2x1
// Write back a 2x1 tile: C[0..1] at CO1 += alpha * s8,s9.
// Clobbers s0, s4, s5; advances CO1 by two floats.
flds s0, ALPHA
flds s4 , [CO1]
flds s5 , [CO1, #4 ]
fmacs s4 , s0 , s8
fmacs s5 , s0 , s9
fsts s4 , [CO1]
fsts s5 , [CO1, #4 ]
add CO1, CO1, #8
.endm
/******************************************************************************/
.macro INIT1x1
// Clear the single accumulator s8 of a 1x1 tile.
// The zero comes from an integer register: computing s8 - s8 would give
// NaN if s8 still held Inf or NaN. Clobbers r3 (kernel scratch register).
mov r3, #0
vmov s8, r3 // all-zero bit pattern is +0.0f
.endm
.macro KERNEL1x1_SUB
// One k-step of the 1x1 tile: s8 += a0 * b0.
// AO and BO each advance by one float.
flds s4 , [ BO ]
flds s0 , [ AO ]
fmacs s8 , s0, s4
add AO , AO, #4
add BO , BO, #4
.endm
.macro SAVE1x1
// Write back a 1x1 tile: the float at CO1 += alpha * s8.
// Clobbers s0 and s4; advances CO1 by one float.
flds s0, ALPHA
flds s4 , [CO1]
fmacs s4 , s0 , s8
fsts s4 , [CO1]
add CO1, CO1, #4
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Single precision GEMM micro-kernel with a 4x2 register tile.
 *
 * Arguments: M (r0), N (r1), K (r2), A (r3), alpha (s0) and, on the
 * stack, B, C and LDC (counted in floats). The kernel walks N in
 * column pairs plus an optional last single column, and M in tiles of
 * 4, 2 and 1 rows. A is read linearly within each column pass, B in
 * panels of K * 2 (or K * 1) floats. Every tile accumulates over K and
 * is merged into C as C += alpha * tile. Returns 0 in r0.
 *
 * Flag handling: asr, ands and tst never write the V flag while the LE
 * condition reads it, so every "count <= 0" test uses an explicit
 * cmp #0 and every "masked bits are zero" test uses beq. This keeps the
 * branches independent of whatever V value the caller left behind.
 *
 * NOTE(review): s8 - s15 are saved and restored here although the AAPCS
 * VFP convention appears to list only s16 - s31 as callee-saved; the
 * save looks redundant but harmless - confirm before removing.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot, stack arguments start at [fp, #4]
sub sp, sp, #STACKSIZE // reserve stack
str OLD_M, M
str OLD_N, N
str OLD_K, K
str OLD_A, A
vstr OLD_ALPHA, ALPHA
sub r3, fp, #128
vstm r3, { s8 - s15} // store floating point registers
ldr r3, OLD_LDC
lsl r3, r3, #2 // ldc = ldc * 4
str r3, LDC
ldr K1, K
ldr BC, B
ldr J, N
asr J, J, #1 // J = N / 2
cmp J, #0
ble sgemm_kernel_L1_BEGIN
/*********************************************************************************************/
sgemm_kernel_L2_BEGIN:
ldr CO1, C // CO1 = C
ldr r4 , LDC
lsl r4 , r4 , #1 // LDC * 2
add r3 , r4, CO1
str r3 , C // store C
ldr AO, A // AO = A
sgemm_kernel_L2_M4_BEGIN:
ldr I, M
asr I, I, #2 // I = M / 4
cmp I, #0
ble sgemm_kernel_L2_M2_BEGIN
sgemm_kernel_L2_M4_20:
INIT4x2
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L2_M4_40
.align 5
sgemm_kernel_L2_M4_22:
pld [ AO, #A_PRE ]
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ AO, #A_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ AO, #A_PRE ]
pld [ BO, #B_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
pld [ AO, #A_PRE ]
KERNEL4x2_SUB
KERNEL4x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M4_22
sgemm_kernel_L2_M4_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L2_M4_100
sgemm_kernel_L2_M4_42:
KERNEL4x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M4_42
sgemm_kernel_L2_M4_100:
SAVE4x2
sgemm_kernel_L2_M4_END:
subs I, I, #1
bgt sgemm_kernel_L2_M4_20
sgemm_kernel_L2_M2_BEGIN:
ldr I, M
tst I , #3 // any rows left after the 4-row tiles?
beq sgemm_kernel_L2_END
tst I, #2 // is there a 2-row tile?
beq sgemm_kernel_L2_M1_BEGIN
sgemm_kernel_L2_M2_20:
INIT2x2
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L2_M2_40
sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
KERNEL2x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M2_22
sgemm_kernel_L2_M2_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L2_M2_100
sgemm_kernel_L2_M2_42:
KERNEL2x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M2_42
sgemm_kernel_L2_M2_100:
SAVE2x2
sgemm_kernel_L2_M2_END:
sgemm_kernel_L2_M1_BEGIN:
tst I, #1 // is there a last single row?
beq sgemm_kernel_L2_END
sgemm_kernel_L2_M1_20:
INIT1x2
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L2_M1_40
sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
KERNEL1x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M1_22
sgemm_kernel_L2_M1_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L2_M1_100
sgemm_kernel_L2_M1_42:
KERNEL1x2_SUB
subs L, L, #1
bgt sgemm_kernel_L2_M1_42
sgemm_kernel_L2_M1_100:
SAVE1x2
sgemm_kernel_L2_END:
mov r3, BC
mov r4, K1
lsl r4, r4, #3 // k * 2 * 4
add r3, r3, r4 // B = B + K * 2 * 4
mov BC, r3
subs J , #1 // j--
bgt sgemm_kernel_L2_BEGIN
/*********************************************************************************************/
sgemm_kernel_L1_BEGIN:
ldr J , N
tst J , #1 // is there a last single column?
beq sgemm_kernel_L999
ldr CO1, C // CO1 = C
ldr r4 , LDC
add r3 , r4, CO1
str r3 , C // store C
ldr AO, A // AO = A
sgemm_kernel_L1_M4_BEGIN:
ldr I, M
asr I, I, #2 // I = M / 4
cmp I, #0
ble sgemm_kernel_L1_M2_BEGIN
sgemm_kernel_L1_M4_20:
INIT4x1
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L1_M4_40
.align 5
sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
KERNEL4x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M4_22
sgemm_kernel_L1_M4_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L1_M4_100
sgemm_kernel_L1_M4_42:
KERNEL4x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M4_42
sgemm_kernel_L1_M4_100:
SAVE4x1
sgemm_kernel_L1_M4_END:
subs I, I, #1
bgt sgemm_kernel_L1_M4_20
sgemm_kernel_L1_M2_BEGIN:
ldr I, M
tst I , #3 // any rows left after the 4-row tiles?
beq sgemm_kernel_L1_END
tst I, #2 // is there a 2-row tile?
beq sgemm_kernel_L1_M1_BEGIN
sgemm_kernel_L1_M2_20:
INIT2x1
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L1_M2_40
sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
KERNEL2x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M2_22
sgemm_kernel_L1_M2_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L1_M2_100
sgemm_kernel_L1_M2_42:
KERNEL2x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M2_42
sgemm_kernel_L1_M2_100:
SAVE2x1
sgemm_kernel_L1_M2_END:
sgemm_kernel_L1_M1_BEGIN:
tst I, #1 // is there a last single row?
beq sgemm_kernel_L1_END
sgemm_kernel_L1_M1_20:
INIT1x1
mov BO, BC
asr L , K1, #3 // L = K / 8
cmp L, #0
ble sgemm_kernel_L1_M1_40
sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
KERNEL1x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M1_22
sgemm_kernel_L1_M1_40:
ands L , K1, #7 // L = K % 8
beq sgemm_kernel_L1_M1_100
sgemm_kernel_L1_M1_42:
KERNEL1x1_SUB
subs L, L, #1
bgt sgemm_kernel_L1_M1_42
sgemm_kernel_L1_M1_100:
SAVE1x1
sgemm_kernel_L1_END:
sgemm_kernel_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
movs r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area, sp -> saved r4
pop {r4 - r9, fp}
bx lr
EPILOGUE

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/24 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// STACKSIZE is defined but this routine never reserves stack with it.
#define STACKSIZE 256
// Registers in which the four leading arguments arrive.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
// First word above the seven pushed registers: the destination pointer.
#define B [fp, #4 ]
// M, N and A stay in their incoming registers for the whole routine.
#define M r0
#define N r1
#define A r2
// BO is the write pointer, AO1/AO2 read two LDA-strided source vectors.
#define BO r5
#define AO1 r6
#define AO2 r7
#define LDA r8
// I reuses r3 (OLD_LDA); LDA is copied to r8 before I is first written.
#define I r3
#define J r12
// A_PRE is defined but no pld in this routine references it.
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro COPY2x2
// Copy two consecutive floats from each of AO1 and AO2, interleaved:
// output order is AO1[0], AO2[0], AO1[1], AO2[1].
// AO1 and AO2 advance by two floats, BO by four.
flds s0 , [ AO1, #0 ]
flds s2 , [ AO1, #4 ]
flds s1 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #8
fstmias BO!, { s0 - s3 }
add AO2, AO2, #8
.endm
.macro COPY1x2
// Copy one float from each of AO1 and AO2: output is AO1[0], AO2[0].
// AO1 and AO2 advance by one float, BO by two.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4
fstmias BO!, { s0 - s1 }
add AO2, AO2, #4
.endm
.macro COPY2x1
// Copy two consecutive floats from AO1 to BO; both advance by two floats.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
fstmias BO!, { s0 - s1 }
add AO1, AO1, #8
.endm
.macro COPY1x1
// Copy one float from AO1 to BO; both advance by one float.
flds s0 , [ AO1, #0 ]
fstmias BO!, { s0 }
add AO1, AO1, #4
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Pack routine with unroll 2. Arguments: M (r0), N (r1), A (r2),
 * LDA in floats (r3) and the destination B on the stack.
 * For every pair of LDA-strided source vectors (N / 2 of them) the
 * elements are written to B interleaved two at a time; a last odd
 * vector is copied straight. M counts the floats taken from each
 * vector. Returns 0 in r0.
 *
 * Flag handling: asr, ands and tst never write the V flag while the LE
 * condition reads it, so "count <= 0" tests use an explicit cmp #0 and
 * "masked bits are zero" tests use beq. The branches therefore do not
 * depend on the V value left behind by the caller.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot, B is at [fp, #4]
lsl LDA, OLD_LDA, #2 // lda = lda * 4
ldr BO, B
/*********************************************************************************************/
sgemm_ncopy_L2_BEGIN:
asr J, N, #1 // J = N / 2
cmp J, #0
ble sgemm_ncopy_L1_BEGIN
sgemm_ncopy_L2_M2_BEGIN:
mov AO1, A // AO1 = A
add AO2, AO1, LDA
add A , AO2, LDA // A = A + 2 * LDA
asr I, M, #1 // I = M / 2
cmp I, #0
ble sgemm_ncopy_L2_M2_40
sgemm_ncopy_L2_M2_20:
COPY2x2
subs I , I , #1
bne sgemm_ncopy_L2_M2_20
sgemm_ncopy_L2_M2_40:
ands I, M , #1 // I = M % 2
beq sgemm_ncopy_L2_M2_END
sgemm_ncopy_L2_M2_60:
COPY1x2
subs I , I , #1
bne sgemm_ncopy_L2_M2_60
sgemm_ncopy_L2_M2_END:
subs J , J, #1 // j--
bne sgemm_ncopy_L2_M2_BEGIN
/*********************************************************************************************/
sgemm_ncopy_L1_BEGIN:
tst N, #1 // is there a last single vector?
beq sgemm_ncopy_L999
sgemm_ncopy_L1_M2_BEGIN:
mov AO1, A // AO1 = A
add A , AO1, LDA // A = A + 1 * LDA
asr I, M, #1 // I = M / 2
cmp I, #0
ble sgemm_ncopy_L1_M2_40
sgemm_ncopy_L1_M2_20:
COPY2x1
subs I , I , #1
bne sgemm_ncopy_L1_M2_20
sgemm_ncopy_L1_M2_40:
ands I, M , #1 // I = M % 2
beq sgemm_ncopy_L1_M2_END
sgemm_ncopy_L1_M2_60:
COPY1x1
subs I , I , #1
bne sgemm_ncopy_L1_M2_60
sgemm_ncopy_L1_M2_END:
sgemm_ncopy_L999:
movs r0, #0 // set return value
sub sp, fp, #24 // sp -> saved r4
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,353 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the saved registers.
#define STACKSIZE 256
// Registers in which the four leading arguments arrive.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15, 32 bytes, starting at
* fp - 128)
*******************************************************/
// Stack slot that keeps LDA in bytes; r3 is reused as the counter I.
#define LDA [fp, #-260 ]
// First word above the seven pushed registers: the destination pointer.
#define B [fp, #4 ]
// M, N and A stay in their incoming registers for the whole routine.
#define M r0
#define N r1
#define A r2
// BO is the write pointer, AO1..AO4 read four LDA-strided source vectors.
#define BO r5
#define AO1 r6
#define AO2 r7
#define AO3 r8
#define AO4 r9
#define I r3
#define J r12
// Byte offset used by the pld prefetch hints of the 4x4 copy loop.
#define A_PRE 192
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro COPY4x4
// Copy a 4x4 block: four consecutive floats from each of AO1..AO4,
// written interleaved so that output group k (k = 0..3) holds
// AO1[k], AO2[k], AO3[k], AO4[k]. Uses s0 - s15.
// Every source pointer advances by four floats, BO by sixteen.
// The pointer increments are interleaved with the loads.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
flds s2 , [ AO3, #0 ]
flds s3 , [ AO4, #0 ]
flds s4 , [ AO1, #4 ]
flds s8 , [ AO1, #8 ]
flds s12, [ AO1, #12 ]
flds s5 , [ AO2, #4 ]
add AO1, AO1, #16
flds s9 , [ AO2, #8 ]
flds s13, [ AO2, #12 ]
flds s6 , [ AO3, #4 ]
add AO2, AO2, #16
flds s10, [ AO3, #8 ]
flds s14, [ AO3, #12 ]
flds s7 , [ AO4, #4 ]
add AO3, AO3, #16
flds s11, [ AO4, #8 ]
flds s15, [ AO4, #12 ]
fstmias BO!, { s0 - s3 }
add AO4, AO4, #16
fstmias BO!, { s4 - s7 }
fstmias BO!, { s8 - s15 }
.endm
.macro COPY1x4
// Copy one float from each of AO1..AO4: output is
// AO1[0], AO2[0], AO3[0], AO4[0].
// Every source pointer advances by one float, BO by four.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4
flds s2 , [ AO3, #0 ]
add AO2, AO2, #4
flds s3 , [ AO4, #0 ]
add AO3, AO3, #4
fstmias BO!, { s0 - s3 }
add AO4, AO4, #4
.endm
.macro COPY4x2
// Copy four consecutive floats from each of AO1 and AO2, interleaved:
// AO1[0], AO2[0], AO1[1], AO2[1], AO1[2], AO2[2], AO1[3], AO2[3].
// AO1 and AO2 advance by four floats, BO by eight.
flds s0 , [ AO1, #0 ]
flds s2 , [ AO1, #4 ]
flds s4 , [ AO1, #8 ]
flds s6 , [ AO1, #12 ]
flds s1 , [ AO2, #0 ]
flds s3 , [ AO2, #4 ]
add AO1, AO1, #16
flds s5 , [ AO2, #8 ]
flds s7 , [ AO2, #12 ]
fstmias BO!, { s0 - s7 }
add AO2, AO2, #16
.endm
.macro COPY1x2
// Copy one float from each of AO1 and AO2: output is AO1[0], AO2[0].
// AO1 and AO2 advance by one float, BO by two.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO2, #0 ]
add AO1, AO1, #4
fstmias BO!, { s0 - s1 }
add AO2, AO2, #4
.endm
.macro COPY4x1
// Copy four consecutive floats from AO1 to BO; both advance by four floats.
flds s0 , [ AO1, #0 ]
flds s1 , [ AO1, #4 ]
flds s2 , [ AO1, #8 ]
flds s3 , [ AO1, #12 ]
fstmias BO!, { s0 - s3 }
add AO1, AO1, #16
.endm
.macro COPY1x1
// Copy one float from AO1 to BO; both advance by one float.
flds s0 , [ AO1, #0 ]
fstmias BO!, { s0 }
add AO1, AO1, #4
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Pack routine with unroll 4. Arguments: M (r0), N (r1), A (r2),
 * LDA in floats (r3) and the destination B on the stack.
 * LDA-strided source vectors are taken four at a time (N / 4 groups),
 * then an optional pair, then an optional single one. Within a group
 * the elements are written to B interleaved: four floats per vector
 * while at least four remain (M / 4 times), then one at a time.
 * Returns 0 in r0.
 *
 * Flag handling: asr, ands and tst never write the V flag while the LE
 * condition reads it, so "count <= 0" tests use an explicit cmp #0 and
 * "masked bits are zero" tests use beq. The branches therefore do not
 * depend on the V value left behind by the caller.
 *
 * NOTE(review): s8 - s15 are saved and restored because COPY4x4 uses
 * them; the AAPCS VFP convention appears to list only s16 - s31 as
 * callee-saved, so the save looks redundant but harmless - confirm.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot, B is at [fp, #4]
sub sp, sp, #STACKSIZE // reserve stack
lsl r3, r3, #2 // lda = lda * 4
str r3, LDA
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
ldr BO, B
sgemm_ncopy_L4_BEGIN:
asr J, N, #2 // J = N / 4
cmp J, #0
ble sgemm_ncopy_L2_BEGIN
sgemm_ncopy_L4_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add AO3, AO2, r4
add AO4, AO3, r4
add A , AO4, r4 // A = A + 4 * LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble sgemm_ncopy_L4_M4_40
// The 4x4 copy loop is unrolled twice; prefetch once per double step.
sgemm_ncopy_L4_M4_20:
pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
pld [ AO3, #A_PRE ]
pld [ AO4, #A_PRE ]
COPY4x4
subs I , I , #1
ble sgemm_ncopy_L4_M4_40
COPY4x4
subs I , I , #1
bne sgemm_ncopy_L4_M4_20
sgemm_ncopy_L4_M4_40:
ands I, M , #3 // I = M % 4
beq sgemm_ncopy_L4_M4_END
sgemm_ncopy_L4_M4_60:
COPY1x4
subs I , I , #1
bne sgemm_ncopy_L4_M4_60
sgemm_ncopy_L4_M4_END:
subs J , J, #1 // j--
bne sgemm_ncopy_L4_M4_BEGIN
/*********************************************************************************************/
sgemm_ncopy_L2_BEGIN:
tst N, #3 // any vectors left after the groups of four?
beq sgemm_ncopy_L999
tst N, #2 // is there a pair?
beq sgemm_ncopy_L1_BEGIN
sgemm_ncopy_L2_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add AO2, AO1, r4
add A , AO2, r4 // A = A + 2 * LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble sgemm_ncopy_L2_M4_40
sgemm_ncopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne sgemm_ncopy_L2_M4_20
sgemm_ncopy_L2_M4_40:
ands I, M , #3 // I = M % 4
beq sgemm_ncopy_L2_M4_END
sgemm_ncopy_L2_M4_60:
COPY1x2
subs I , I , #1
bne sgemm_ncopy_L2_M4_60
sgemm_ncopy_L2_M4_END:
/*********************************************************************************************/
sgemm_ncopy_L1_BEGIN:
tst N, #1 // is there a last single vector?
beq sgemm_ncopy_L999
sgemm_ncopy_L1_M4_BEGIN:
mov AO1, A // AO1 = A
ldr r4 , LDA
add A , AO1, r4 // A = A + 1 * LDA
asr I, M, #2 // I = M / 4
cmp I, #0
ble sgemm_ncopy_L1_M4_40
sgemm_ncopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne sgemm_ncopy_L1_M4_20
sgemm_ncopy_L1_M4_40:
ands I, M , #3 // I = M % 4
beq sgemm_ncopy_L1_M4_END
sgemm_ncopy_L1_M4_60:
COPY1x1
subs I , I , #1
bne sgemm_ncopy_L1_M4_60
sgemm_ncopy_L1_M4_END:
sgemm_ncopy_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
movs r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area, sp -> saved r4
pop {r4 - r9, fp}
bx lr
EPILOGUE

View File

@ -0,0 +1,430 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/06 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack the prologue reserves below the saved registers.
#define STACKSIZE 256
// Registers in which the four leading arguments arrive.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
* (the prologue stores s8 - s15, 32 bytes, starting at
* fp - 128)
*******************************************************/
// B is the stack-passed destination pointer; the routine also updates
// this slot as its running output position. A is a local stack slot
// holding the running source position.
#define B [fp, #4 ]
#define A [fp, #-248 ]
#define M r0
#define N r1
// M4 reuses r2 (OLD_A) after A has been spilled: M * 4 floats in bytes.
#define M4 r2
#define LDA r5
// AO1 reads the source; BO1, BO2, BO3 are three separate write pointers
// for the 4-wide, 2-wide and 1-wide parts of the output.
#define AO1 r6
#define BO1 r7
#define BO2 r8
#define BO3 r9
#define I r4
#define J r12
// Byte offset used by the pld prefetch hints in COPY4x4_1.
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro COPY4x4_1
// Copy a 4x4 block with prefetch: four consecutive floats from each of
// the four LDA-strided vectors starting at AO1 are loaded into
// s0 - s15 and stored as 16 contiguous floats at BO1.
// r3 is the scratch pointer that steps through the vectors.
// AO1 advances by four floats, BO1 by M4 bytes.
pld [ AO1, #A_PRE ]
fldmias AO1, { s0 - s3 }
add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmias r3, { s4 - s7 }
add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmias r3, { s8 - s11 }
add r3, r3, LDA
pld [ r3, #A_PRE ]
fldmias r3, { s12 - s15 }
fstmias BO1, { s0 - s15 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
.macro COPY4x4_2
// Same 4x4 block copy as COPY4x4_1 but without the pld prefetch hints.
// Clobbers r3 and s0 - s15; AO1 advances by four floats, BO1 by M4 bytes.
fldmias AO1, { s0 - s3 }
add r3, AO1, LDA
fldmias r3, { s4 - s7 }
add r3, r3, LDA
fldmias r3, { s8 - s11 }
add r3, r3, LDA
fldmias r3, { s12 - s15 }
fstmias BO1, { s0 - s15 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
.macro COPY2x4
// Copy two consecutive floats from each of four LDA-strided vectors
// starting at AO1 and store them as 8 contiguous floats at BO2.
// Clobbers r3; AO1 advances by two floats, BO2 by eight.
fldmias AO1, { s0 - s1 }
add r3, AO1, LDA
fldmias r3, { s2 - s3 }
add r3, r3, LDA
fldmias r3, { s4 - s5 }
add r3, r3, LDA
fldmias r3, { s6 - s7 }
fstmias BO2, { s0 - s7 }
add AO1, AO1, #8
add BO2, BO2, #32
.endm
.macro COPY1x4
// Copy one float from each of four LDA-strided vectors starting at AO1
// and store them as 4 contiguous floats at BO3.
// Clobbers r3; AO1 advances by one float, BO3 by four.
fldmias AO1, { s0 }
add r3, AO1, LDA
fldmias r3, { s1 }
add r3, r3, LDA
fldmias r3, { s2 }
add r3, r3, LDA
fldmias r3, { s3 }
fstmias BO3, { s0 - s3 }
add AO1, AO1, #4
add BO3, BO3, #16
.endm
/*************************************************************************************************************************/
.macro COPY4x2
// Copy four consecutive floats from each of two LDA-strided vectors
// starting at AO1 and store them as 8 contiguous floats at BO1.
// Clobbers r3; AO1 advances by four floats, BO1 by M4 bytes.
fldmias AO1, { s0 - s3 }
add r3, AO1, LDA
fldmias r3, { s4 - s7 }
fstmias BO1, { s0 - s7 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
.macro COPY2x2
// Copy two consecutive floats from each of two LDA-strided vectors
// starting at AO1 and store them as 4 contiguous floats at BO2.
// Clobbers r3; AO1 advances by two floats, BO2 by four.
fldmias AO1, { s0 - s1 }
add r3, AO1, LDA
fldmias r3, { s2 - s3 }
fstmias BO2, { s0 - s3 }
add AO1, AO1, #8
add BO2, BO2, #16
.endm
.macro COPY1x2
// Copy one float from each of two LDA-strided vectors starting at AO1
// and store them as 2 contiguous floats at BO3.
// Clobbers r3; AO1 advances by one float, BO3 by two.
fldmias AO1, { s0 }
add r3, AO1, LDA
fldmias r3, { s1 }
fstmias BO3, { s0 - s1 }
add AO1, AO1, #4
add BO3, BO3, #8
.endm
/*************************************************************************************************************************/
.macro COPY4x1
// Copy four consecutive floats from AO1 to BO1.
// AO1 advances by four floats, BO1 by M4 bytes.
fldmias AO1, { s0 - s3 }
fstmias BO1, { s0 - s3 }
add AO1, AO1, #16
add BO1, BO1, M4
.endm
.macro COPY2x1
// Copy two consecutive floats from AO1 to BO2; both advance by two floats.
fldmias AO1, { s0 - s1 }
fstmias BO2, { s0 - s1 }
add AO1, AO1, #8
add BO2, BO2, #8
.endm
.macro COPY1x1
// Copy one float from AO1 to BO3; both advance by one float.
fldmias AO1, { s0 }
fstmias BO3, { s0 }
add AO1, AO1, #4
add BO3, BO3, #4
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Transposing pack routine with unroll 4. Arguments: M (r0), N (r1),
 * A (r2), LDA in floats (r3) and the destination B on the stack.
 * The source is consumed in groups of four LDA-strided vectors (M / 4
 * groups), then an optional pair, then an optional single one. Within a
 * group, N floats per vector are processed four at a time (N / 4 steps,
 * written through BO1 which strides by M4), then an optional pair
 * (written through BO2) and an optional single float (through BO3).
 * BO2 starts at B + (N & -4) * M floats, BO3 at B + (N & -2) * M floats.
 * Returns 0 in r0.
 *
 * Flag handling: asr and tst never write the V flag while the LE
 * condition reads it, so "count <= 0" tests use an explicit cmp #0 and
 * "bit is clear" tests use beq. The branches therefore do not depend on
 * the V value left behind by the caller.
 *
 * NOTE(review): s8 - s15 are saved and restored because the 4x4 copy
 * uses them; the AAPCS VFP convention appears to list only s16 - s31 as
 * callee-saved, so the save looks redundant but harmless - confirm.
 */
PROLOGUE
.align 5
push {r4 - r9, fp}
add fp, sp, #24 // fp -> saved fp slot, B is at [fp, #4]
sub sp, sp, #STACKSIZE // reserve stack
str OLD_A, A // store A
lsl LDA, OLD_LDA, #2 // lda = lda * SIZE
sub r4, fp, #128
vstm r4, { s8 - s15} // store floating point registers
lsl r4 , M, #2 // M * SIZE
ldr r3, B
and BO2 , N , #-4
and BO3 , N , #-2
mul BO2, BO2, r4
mul BO3, BO3, r4
add BO2 , BO2, r3 // BO2 = B + (N & -4) * M * SIZE
add BO3 , BO3, r3 // BO3 = B + (N & -2) * M * SIZE
lsl M4, M, #4 // M4 = M * 4 * SIZE
sgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
ble sgemm_tcopy_L2_BEGIN
sgemm_tcopy_L4_M4_BEGIN:
ldr AO1, A // AO1 = A
lsl r3, LDA, #2 // r3 = 4 * LDA
add r3, r3 , AO1 // A = A + 4 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #64 // B = B + 16 * SIZE
str r3, B
asr I, N, #2 // I = N / 4
cmp I, #0
ble sgemm_tcopy_L4_M4_40
// The 4x4 copy loop is unrolled twice; only the first copy prefetches.
sgemm_tcopy_L4_M4_20:
COPY4x4_1
subs I , I , #1
ble sgemm_tcopy_L4_M4_40
COPY4x4_2
subs I , I , #1
bne sgemm_tcopy_L4_M4_20
sgemm_tcopy_L4_M4_40:
tst N , #2 // is there a pair left in N?
beq sgemm_tcopy_L4_M4_60
COPY2x4
sgemm_tcopy_L4_M4_60:
tst N, #1 // is there a single float left in N?
beq sgemm_tcopy_L4_M4_END
COPY1x4
sgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
bne sgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
sgemm_tcopy_L2_BEGIN:
tst M, #3 // any vectors left after the groups of four?
beq sgemm_tcopy_L999
tst M, #2 // is there a pair?
beq sgemm_tcopy_L1_BEGIN
sgemm_tcopy_L2_M4_BEGIN:
ldr AO1, A // AO1 = A
lsl r3, LDA, #1 // r3 = 2 * LDA
add r3, r3 , AO1 // A = A + 2 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #32 // B = B + 8 * SIZE
str r3, B
asr I, N, #2 // I = N / 4
cmp I, #0
ble sgemm_tcopy_L2_M4_40
sgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne sgemm_tcopy_L2_M4_20
sgemm_tcopy_L2_M4_40:
tst N , #2 // is there a pair left in N?
beq sgemm_tcopy_L2_M4_60
COPY2x2
sgemm_tcopy_L2_M4_60:
tst N , #1 // is there a single float left in N?
beq sgemm_tcopy_L2_M4_END
COPY1x2
sgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
sgemm_tcopy_L1_BEGIN:
tst M, #1 // is there a last single vector?
beq sgemm_tcopy_L999
sgemm_tcopy_L1_M4_BEGIN:
ldr AO1, A // AO1 = A
add r3, LDA , AO1 // A = A + 1 * LDA
str r3, A // store A
ldr BO1, B
add r3, BO1, #16 // B = B + 4 * SIZE
str r3, B
asr I, N, #2 // I = N / 4
cmp I, #0
ble sgemm_tcopy_L1_M4_40
sgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne sgemm_tcopy_L1_M4_20
sgemm_tcopy_L1_M4_40:
tst N , #2 // is there a pair left in N?
beq sgemm_tcopy_L1_M4_60
COPY2x1
sgemm_tcopy_L1_M4_60:
tst N , #1 // is there a single float left in N?
beq sgemm_tcopy_L1_M4_END
COPY1x1
sgemm_tcopy_L1_M4_END:
sgemm_tcopy_L999:
sub r3, fp, #128
vldm r3, { s8 - s15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24 // drop the scratch area, sp -> saved r4
pop {r4 - r9, fp}
bx lr
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

62
kernel/arm/swap.c Normal file
View File

@ -0,0 +1,62 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/08/20 Saar
* BLASTEST float OK
* BLASTEST double OK
*
**************************************************************************************/
#include "common.h"
#include <stdio.h>
/*
 * Exchange n elements between the strided vectors x and y.
 *
 * n       number of elements to exchange; nothing happens when n <= 0
 * x, y    vectors whose elements are swapped in place
 * inc_x   index step between consecutive elements of x
 * inc_y   index step between consecutive elements of y
 *
 * The dummy parameters are accepted and ignored.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG count;
	BLASLONG pos_x = 0;
	BLASLONG pos_y = 0;
	FLOAT held;

	for (count = 0; count < n; count++) {
		held = x[pos_x];
		x[pos_x] = y[pos_y];
		y[pos_y] = held;

		pos_x += inc_x;
		pos_y += inc_y;
	}

	return 0;
}

354
kernel/arm/swap_vfp.S Normal file
View File

@ -0,0 +1,354 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/14 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACKSIZE 256
// Operands addressed relative to fp at non-negative offsets.
// NOTE(review): presumably the stack-passed arguments read by the
// routine prologue - confirm against the function body.
#define OLD_INC_X [fp, #0 ]
#define OLD_Y [fp, #4 ]
#define OLD_INC_Y [fp, #8 ]
// Register aliases used by the swap macros below: X and Y are the two
// vector pointers, INC_X and INC_Y are added to them by KERNEL_S1.
#define N r0
#define Y r1
#define INC_X r2
#define X r3
#define INC_Y r4
#define I r12
// Byte offset used by the pld prefetch hints on both X and Y.
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*****************************************************************************************/
/*
 * Swap macros. Each variant loads a group of values from X and from Y and
 * stores them back crosswise (X data to Y, Y data to X).
 *
 *   KERNEL_F4 : four elements, contiguous; both pointers post-increment.
 *   KERNEL_F1 : one element, contiguous; both pointers post-increment.
 *   KERNEL_S1 : one element, strided; pointers advance by INC_X / INC_Y,
 *               which the routine has already scaled to bytes.
 *
 * Four variants are selected by COMPLEX and DOUBLE.
 */
#if !defined(COMPLEX)
#if defined(DOUBLE)
/* Real double precision: one element is one d register. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 }
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 }
fstmiad X!, { d4 - d7}
.endm
.macro KERNEL_F1
fldmiad X, { d0 }
fldmiad Y, { d4 }
fstmiad Y!, { d0 }
fstmiad X!, { d4 }
.endm
.macro KERNEL_S1
fldmiad X, { d0 }
fldmiad Y, { d4 }
fstmiad Y, { d0 }
fstmiad X, { d4 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#else
/* Real single precision: one element is one s register.
   This KERNEL_F4 has no pld of its own; the routine issues the prefetch
   at the top of its unrolled loop instead. */
.macro KERNEL_F4
fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}
.endm
.macro KERNEL_F1
fldmias X, { s0 }
fldmias Y, { s4 }
fstmias Y!, { s0 }
fstmias X!, { s4 }
.endm
.macro KERNEL_S1
fldmias X, { s0 }
fldmias Y, { s4 }
fstmias Y, { s0 }
fstmias X, { s4 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#endif
#else
#if defined(DOUBLE)
/* Complex double precision: one element is two d registers, so
   KERNEL_F4 needs two rounds of four d registers each. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 }
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 }
fstmiad X!, { d4 - d7}
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X, { d0 - d3 }
fldmiad Y, { d4 - d7 }
fstmiad Y!, { d0 - d3 }
fstmiad X!, { d4 - d7}
.endm
.macro KERNEL_F1
fldmiad X, { d0 - d1 }
fldmiad Y, { d4 - d5 }
fstmiad Y!, { d0 - d1 }
fstmiad X!, { d4 - d5 }
.endm
.macro KERNEL_S1
fldmiad X, { d0 - d1 }
fldmiad Y, { d4 - d5 }
fstmiad Y, { d0 - d1 }
fstmiad X, { d4 - d5 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#else
/* Complex single precision: one element is two s registers, so
   KERNEL_F4 needs two rounds of four s registers each (one prefetch
   pair covers both rounds). */
.macro KERNEL_F4
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}
fldmias X, { s0 - s3 }
fldmias Y, { s4 - s7 }
fstmias Y!, { s0 - s3 }
fstmias X!, { s4 - s7}
.endm
.macro KERNEL_F1
fldmias X, { s0 - s1 }
fldmias Y, { s4 - s5 }
fstmias Y!, { s0 - s1 }
fstmias X!, { s4 - s5 }
.endm
.macro KERNEL_S1
fldmias X, { s0 - s1 }
fldmias Y, { s4 - s5 }
fstmias Y, { s0 - s1 }
fstmias X, { s4 - s5 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
#endif
#endif
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Swap kernel entry point: exchanges N elements between the vectors at X
 * and Y using the KERNEL_* macros, then returns 0 in r0.
 *
 * Nothing is done when N <= 0 or when either increment is zero.
 * When both increments are 1 the contiguous (F) path is taken,
 * otherwise the strided (S) path.
 */
PROLOGUE
.align 5
// Save r4 (used as INC_Y) and fp; fp then points at the first stacked argument.
push {r4 , fp}
add fp, sp, #8
ldr INC_X , OLD_INC_X
ldr Y, OLD_Y
ldr INC_Y , OLD_INC_Y
// Early exits: nothing to swap, or a zero stride.
cmp N, #0
ble swap_kernel_L999
cmp INC_X, #0
beq swap_kernel_L999
cmp INC_Y, #0
beq swap_kernel_L999
// Any non-unit stride selects the strided path.
cmp INC_X, #1
bne swap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN
// ---- contiguous path ----
swap_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble swap_kernel_F1
.align 5
// Loop body is unrolled twice (two KERNEL_F4 per pass) with an exit
// test between the two halves so an odd group count is handled.
swap_kernel_F4:
#if !defined(COMPLEX) && !defined(DOUBLE)
// The real single-precision KERNEL_F4 carries no prefetch of its own.
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
#endif
KERNEL_F4
subs I, I, #1
ble swap_kernel_F1
KERNEL_F4
subs I, I, #1
bne swap_kernel_F4
// Remaining N mod 4 elements, one at a time.
swap_kernel_F1:
ands I, N, #3
ble swap_kernel_L999
swap_kernel_F10:
KERNEL_F1
subs I, I, #1
bne swap_kernel_F10
b swap_kernel_L999
// ---- strided path ----
// Convert both increments from element counts to byte offsets; the shift
// amount is log2 of the element size for the selected type.
swap_kernel_S_BEGIN:
#if defined(COMPLEX)
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif
#else
#if defined(DOUBLE)
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #2 // INC_X * SIZE
lsl INC_Y, INC_Y, #2 // INC_Y * SIZE
#endif
#endif
asrs I, N, #2 // I = N / 4
ble swap_kernel_S1
.align 5
// Four single-element swaps per pass.
swap_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne swap_kernel_S4
// Remaining N mod 4 elements.
swap_kernel_S1:
ands I, N, #3
ble swap_kernel_L999
swap_kernel_S10:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S10
// Common exit: return 0 and undo the prologue.
swap_kernel_L999:
mov r0, #0 // set return value
sub sp, fp, #8
pop {r4,fp}
bx lr
EPILOGUE

81
kernel/arm/zamax.c Normal file
View File

@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/* |Re| + |Im| of the complex element whose real part is stored at x[i].
   Fully parenthesized so it can be used inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Returns the largest value of |Re(x_k)| + |Im(x_k)| over the n complex
 * elements of x, where consecutive elements are inc_x complex slots apart
 * (x is stored as interleaved real/imaginary pairs).
 *
 * Returns 0.0 when n <= 0 or inc_x < 1. The empty-vector case must be
 * rejected before x is touched: reading x[0] and x[1] for n == 0 would
 * access memory the caller never promised to provide.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG inc_x2;
	FLOAT maxf;

	if (n <= 0 || inc_x < 1) return(0.0);

	inc_x2 = 2 * inc_x;	/* stride in FLOAT units */

	/* Seed with the first element, then scan the remaining n - 1. */
	maxf = CABS1(x, 0);
	ix = inc_x2;

	for (i = 1; i < n; i++)
	{
		FLOAT cur = CABS1(x, ix);

		if (cur > maxf)
		{
			maxf = cur;
		}
		ix += inc_x2;
	}
	return(maxf);
}

81
kernel/arm/zamin.c Normal file
View File

@ -0,0 +1,81 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : NoTest
* TEST : NoTest
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif

/* |Re| + |Im| of the complex element whose real part is stored at x[i].
   Fully parenthesized so it can be used inside larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1]))

/*
 * Returns the smallest value of |Re(x_k)| + |Im(x_k)| over the n complex
 * elements of x, where consecutive elements are inc_x complex slots apart
 * (x is stored as interleaved real/imaginary pairs).
 *
 * Returns 0.0 when n <= 0 or inc_x < 1. The empty-vector case must be
 * rejected before x is touched: reading x[0] and x[1] for n == 0 would
 * access memory the caller never promised to provide.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	BLASLONG i;
	BLASLONG ix;
	BLASLONG inc_x2;
	FLOAT minf;

	if (n <= 0 || inc_x < 1) return(0.0);

	inc_x2 = 2 * inc_x;	/* stride in FLOAT units */

	/* Seed with the first element, then scan the remaining n - 1. */
	minf = CABS1(x, 0);
	ix = inc_x2;

	for (i = 1; i < n; i++)
	{
		FLOAT cur = CABS1(x, ix);

		if (cur < minf)
		{
			minf = cur;
		}
		ix += inc_x2;
	}
	return(minf);
}

71
kernel/arm/zasum.c Normal file
View File

@ -0,0 +1,71 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
#define CABS1(x,i) ABS(x[i])+ABS(x[i+1])

/*
 * Sum of |Re| + |Im| over n complex elements of x (interleaved real and
 * imaginary parts), stepping inc_x complex slots between elements.
 * Yields 0.0 when n is negative or inc_x is smaller than 1.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT sumf = 0.0;
	BLASLONG step;
	BLASLONG pos;

	if (!(n >= 0 && inc_x >= 1))
	{
		return(sumf);
	}

	step = 2 * inc_x;	/* distance between elements in FLOAT units */
	n *= step;		/* n now holds the end index into x */

	for (pos = 0; pos < n; pos += step)
	{
		sumf += CABS1(x,pos);
	}

	return(sumf);
}

72
kernel/arm/zaxpy.c Normal file
View File

@ -0,0 +1,72 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/15 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Complex axpy: y += alpha * x, or y += alpha * conj(x) when CONJ is
 * defined, with alpha = da_r + i*da_i. x and y hold interleaved
 * real/imaginary pairs; inc_x and inc_y are strides in complex elements.
 * Does nothing for negative n or when alpha is exactly zero.
 * The dummy arguments are unused. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos, ypos;
	BLASLONG xstep, ystep;

	if ( n < 0 )
	{
		return(0);
	}
	if ( !(da_r != 0.0 || da_i != 0.0) )
	{
		return(0);
	}

	/* strides expressed in FLOAT units */
	xstep = 2 * inc_x;
	ystep = 2 * inc_y;

	xpos = 0;
	ypos = 0;

	for (k = 0; k < n; k++)
	{
		/* The real part is updated before the imaginary part is
		   computed; x is re-read from memory each time. */
#if !defined(CONJ)
		y[ypos]   += ( da_r * x[xpos]   - da_i * x[xpos+1] ) ;
		y[ypos+1] += ( da_r * x[xpos+1] + da_i * x[xpos]   ) ;
#else
		y[ypos]   += ( da_r * x[xpos]   + da_i * x[xpos+1] ) ;
		y[ypos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos]   ) ;
#endif
		xpos += xstep ;
		ypos += ystep ;
	}

	return(0);
}

63
kernel/arm/zcopy.c Normal file
View File

@ -0,0 +1,63 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Complex copy: writes n complex elements of x into y. Both vectors hold
 * interleaved real/imaginary pairs; inc_x and inc_y are strides in
 * complex elements. Does nothing for negative n. Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	BLASLONG k;
	BLASLONG src, dst;
	BLASLONG src_step, dst_step;

	if ( n < 0 )
	{
		return(0);
	}

	/* strides expressed in FLOAT units */
	src_step = 2 * inc_x;
	dst_step = 2 * inc_y;

	src = 0;
	dst = 0;

	for (k = 0; k < n; k++)
	{
		y[dst]   = x[src] ;	/* real part */
		y[dst+1] = x[src+1] ;	/* imaginary part */
		src += src_step;
		dst += dst_step;
	}

	return(0);
}

223
kernel/arm/zcopy_vfp.S Normal file
View File

@ -0,0 +1,223 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases used by the zcopy kernel below. */

/* Bytes of stack reserved below the saved registers. */
#define STACKSIZE 256
/* Element count, as received in r0. */
#define N r0
/* Source pointer, as received in r1. */
#define X r1
/* Source stride, as received in r2; scaled to bytes on the strided path. */
#define INC_X r2
/* Destination pointer as received in r3; the prologue moves it into Y. */
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Destination stride on the stack; fp is set to sp + 24 after seven
   registers are pushed, which puts this argument at [fp, #4]. */
#define OLD_INC_Y [fp, #4 ]
/* Loop counter. */
#define I r5
/* Destination pointer (working copy of OLD_Y). */
#define Y r6
/* Destination stride; scaled to bytes on the strided path. */
#define INC_Y r7
/* Byte offset used by the pld prefetch hints. */
#define X_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/* Contiguous copy of eight doubles (four elements of two doubles each);
   X and Y both post-increment. Prefetches two offsets ahead on X. */
.macro COPY_F4
pld [ X, #X_PRE ]
pld [ X, #X_PRE+32 ]
fldmiad X!, { d0 - d7 }
fstmiad Y!, { d0 - d7 }
.endm
/* Contiguous copy of one element (two doubles); X and Y post-increment. */
.macro COPY_F1
fldmiad X!, { d0 - d1 }
fstmiad Y!, { d0 - d1 }
.endm
/*************************************************************************************************************************/
/* Strided copy of four elements. Each step moves two doubles and then
   advances X and Y by INC_X / INC_Y, which the routine has already
   converted to byte offsets. Register pairs d0-d1 and d2-d3 alternate. */
.macro COPY_S4
nop
fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d2 - d3 }
fstmiad Y, { d2 - d3 }
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d2 - d3 }
fstmiad Y, { d2 - d3 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
/* Strided copy of a single element (two doubles). */
.macro COPY_S1
fldmiad X, { d0 - d1 }
fstmiad Y, { d0 - d1 }
add X, X, INC_X
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * zcopy kernel entry point: copies N elements of two doubles each from X
 * to Y using the COPY_* macros, then returns 0 in r0.
 *
 * Nothing is copied when N <= 0 or when either increment is zero.
 * When both increments are 1 the contiguous (F) path is taken,
 * otherwise the strided (S) path.
 */
PROLOGUE
.align 5
// Save seven core registers (28 bytes); fp ends up on the saved-fp slot,
// so the first stacked argument is at [fp, #4].
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
// d8 - d15 are spilled to [fp, #-128] .. [fp, #-64] and reloaded on exit.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
// Early exits: nothing to copy, or a zero stride.
cmp N, #0
ble zcopy_kernel_L999
cmp INC_X, #0
beq zcopy_kernel_L999
cmp INC_Y, #0
beq zcopy_kernel_L999
// Any non-unit stride selects the strided path.
cmp INC_X, #1
bne zcopy_kernel_S_BEGIN
cmp INC_Y, #1
bne zcopy_kernel_S_BEGIN
// ---- contiguous path ----
zcopy_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble zcopy_kernel_F1
zcopy_kernel_F4:
COPY_F4
subs I, I, #1
bne zcopy_kernel_F4
// Remaining N mod 4 elements, one at a time.
zcopy_kernel_F1:
ands I, N, #3
ble zcopy_kernel_L999
zcopy_kernel_F10:
COPY_F1
subs I, I, #1
bne zcopy_kernel_F10
b zcopy_kernel_L999
// ---- strided path ----
// Convert both increments from element counts to byte offsets (x16).
zcopy_kernel_S_BEGIN:
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
asrs I, N, #2 // I = N / 4
ble zcopy_kernel_S1
zcopy_kernel_S4:
COPY_S4
subs I, I, #1
bne zcopy_kernel_S4
// Remaining N mod 4 elements.
zcopy_kernel_S1:
ands I, N, #3
ble zcopy_kernel_L999
zcopy_kernel_S10:
COPY_S1
subs I, I, #1
bne zcopy_kernel_S10
// Common exit: reload d8 - d15, return 0 and undo the prologue.
zcopy_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
mov r0, #0 // set return value
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

78
kernel/arm/zdot.c Normal file
View File

@ -0,0 +1,78 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : FAIL
* BLASTEST double : FAIL
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <complex.h>
/*
 * Complex dot product of n elements: sum of x_k * y_k, or of
 * conj(x_k) * y_k when CONJ is defined. x and y hold interleaved
 * real/imaginary pairs; inc_x and inc_y are strides in complex elements.
 * Returns 0 + 0i when n < 1.
 */
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
	FLOAT _Complex result;
	FLOAT acc_re = 0.0;
	FLOAT acc_im = 0.0;
	BLASLONG k;
	BLASLONG xpos, ypos;
	BLASLONG xstep, ystep;

	__real__ result = 0.0 ;
	__imag__ result = 0.0 ;

	if ( n < 1 )
	{
		return(result);
	}

	/* strides expressed in FLOAT units */
	xstep = 2 * inc_x ;
	ystep = 2 * inc_y ;

	xpos = 0;
	ypos = 0;

	for (k = 0; k < n; k++)
	{
#if !defined(CONJ)
		acc_re += ( x[xpos]   * y[ypos] - x[xpos+1] * y[ypos+1] ) ;
		acc_im += ( x[xpos+1] * y[ypos] + x[xpos]   * y[ypos+1] ) ;
#else
		acc_re += ( x[xpos]   * y[ypos] + x[xpos+1] * y[ypos+1] ) ;
		acc_im -= ( x[xpos+1] * y[ypos] - x[xpos]   * y[ypos+1] ) ;
#endif
		xpos += xstep ;
		ypos += ystep ;
	}

	__real__ result = acc_re;
	__imag__ result = acc_im;
	return(result);
}

286
kernel/arm/zdot_vfp.S Normal file
View File

@ -0,0 +1,286 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/11 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases used by the zdot kernel below. */

/* Bytes of stack reserved below the saved registers. */
#define STACKSIZE 256
/* Element count, as received in r0. */
#define N r0
/* First vector pointer, as received in r1. */
#define X r1
/* Stride of X, as received in r2; scaled to bytes on the strided path. */
#define INC_X r2
/* Second vector pointer as received in r3; the prologue moves it into Y. */
#define OLD_Y r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Stride of Y on the stack; fp is set to sp + 24 after seven registers
   are pushed, which puts this argument at [fp, #4]. */
#define OLD_INC_Y [fp, #4 ]
/* Loop counter. */
#define I r5
/* Second vector pointer (working copy of OLD_Y). */
#define Y r6
/* Stride of Y; scaled to bytes on the strided path. */
#define INC_Y r7
/* Byte offset used by the pld prefetch hints. */
#define X_PRE 512
/**************************************************************************************
* Macro definitions
**************************************************************************************/
/*
 * Accumulation macros for the double-complex dot product.
 *
 * For each element, with x = (xr, xi) loaded from X and y = (yr, yi)
 * loaded from Y, four partial sums are updated:
 *     d0 += xr * yr      d1 += xr * yi
 *     d2 += xi * yi      d3 += xi * yr
 * The routine combines d0..d3 into the final result after the loops.
 * d8 - d11 are used as scratch for the Y values.
 */

/* Four elements, contiguous; X and Y post-increment. Loads and prefetches
   are interleaved with the multiply-accumulates. */
.macro KERNEL_F4
pld [ X, #X_PRE ]
pld [ Y, #X_PRE ]
fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fldmiad X!, { d6 - d7 }
fmacd d2 , d5, d9
fmacd d3 , d5, d8
fldmiad Y!, { d10 - d11 }
fmacd d0 , d6, d10
fmacd d1 , d6, d11
pld [ X, #X_PRE ]
fmacd d2 , d7, d11
fmacd d3 , d7, d10
pld [ Y, #X_PRE ]
fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fldmiad X!, { d6 - d7 }
fmacd d2 , d5, d9
fmacd d3 , d5, d8
fldmiad Y!, { d10 - d11 }
fmacd d0 , d6, d10
fmacd d1 , d6, d11
fmacd d2 , d7, d11
fmacd d3 , d7, d10
.endm
/* One element, contiguous; X and Y post-increment. */
.macro KERNEL_F1
fldmiad X!, { d4 - d5 }
fldmiad Y!, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
.endm
/*************************************************************************************************************************/
/* Four elements, strided; after each element X and Y advance by
   INC_X / INC_Y, which the routine has already converted to bytes. */
.macro KERNEL_S4
nop
fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y
fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y
.endm
/* One element, strided. */
.macro KERNEL_S1
fldmiad X, { d4 - d5 }
fldmiad Y, { d8 - d9 }
fmacd d0 , d4, d8
fmacd d1 , d4, d9
fmacd d2 , d5, d9
fmacd d3 , d5, d8
add X, X, INC_X
add Y, Y, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * zdot kernel entry point: double-complex dot product of N elements of X
 * and Y. The result is left in d0 (real part) and d1 (imaginary part).
 *
 * The KERNEL_* macros accumulate four partial sums:
 *     d0 = sum xr*yr   d1 = sum xr*yi   d2 = sum xi*yi   d3 = sum xi*yr
 * which are combined at the exit label:
 *     without CONJ:  re = d0 - d2,  im = d1 + d3
 *     with CONJ:     re = d0 + d2,  im = d1 - d3
 *
 * When N <= 0 or either increment is zero, the loops are skipped and the
 * result is zero.
 */
PROLOGUE
.align 5
// Save seven core registers (28 bytes); fp ends up on the saved-fp slot,
// so the first stacked argument is at [fp, #4].
push {r4 - r9, fp}
add fp, sp, #24
sub sp, sp, #STACKSIZE // reserve stack
// d8 - d15 are spilled to [fp, #-128] .. [fp, #-64]; the macros use d8 - d11.
sub r4, fp, #128
vstm r4, { d8 - d15} // store floating point registers
mov Y, OLD_Y
ldr INC_Y, OLD_INC_Y
// Clear the four accumulators with a real zero. Subtracting a register
// from itself is not a safe way to do this: d0 - d3 hold arbitrary
// values on entry, and NaN - NaN or Inf - Inf yields NaN, which would
// then poison the whole sum. r4 is free here (saved above, and its use
// as a temporary address is finished).
mov r4, #0
vmov s0, r4 // s0 = +0.0f
vcvt.f64.f32 d0 , s0 // d0 = +0.0 (s0 stays zero: it is the low word of d0)
vcvt.f64.f32 d1 , s0
vcvt.f64.f32 d2 , s0
vcvt.f64.f32 d3 , s0
// Early exits: nothing to accumulate, or a zero stride.
cmp N, #0
ble zdot_kernel_L999
cmp INC_X, #0
beq zdot_kernel_L999
cmp INC_Y, #0
beq zdot_kernel_L999
// Any non-unit stride selects the strided path.
cmp INC_X, #1
bne zdot_kernel_S_BEGIN
cmp INC_Y, #1
bne zdot_kernel_S_BEGIN
// ---- contiguous path ----
zdot_kernel_F_BEGIN:
asrs I, N, #2 // I = N / 4
ble zdot_kernel_F1
zdot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zdot_kernel_F4
// Remaining N mod 4 elements, one at a time.
zdot_kernel_F1:
ands I, N, #3
ble zdot_kernel_L999
zdot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zdot_kernel_F10
b zdot_kernel_L999
// ---- strided path ----
// Convert both increments from element counts to byte offsets (x16).
zdot_kernel_S_BEGIN:
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
asrs I, N, #2 // I = N / 4
ble zdot_kernel_S1
zdot_kernel_S4:
KERNEL_S4
subs I, I, #1
bne zdot_kernel_S4
// Remaining N mod 4 elements.
zdot_kernel_S1:
ands I, N, #3
ble zdot_kernel_L999
zdot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zdot_kernel_S10
// Common exit: reload d8 - d15, fold the partial sums into d0/d1 and
// undo the prologue.
zdot_kernel_L999:
sub r3, fp, #128
vldm r3, { d8 - d15} // restore floating point registers
#if !defined(CONJ)
vsub.f64 d0 , d0, d2
vadd.f64 d1 , d1, d3
#else
vadd.f64 d0 , d0, d2
vsub.f64 d1 , d1, d3
#endif
sub sp, fp, #24
pop {r4 - r9, fp}
bx lr
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,254 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/05 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register and stack-slot aliases used by the zgemm ncopy kernel below. */

/* Bytes of stack reserved below the saved registers. */
#define STACKSIZE 256
/* Incoming arguments in r0 - r3. */
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
/* Local stack slot holding lda after it has been scaled to bytes. */
#define LDA [fp, #-260 ]
/* Destination buffer pointer, read from the stack into BO. */
#define B [fp, #4 ]
/* M, N and A stay in the registers they arrive in. */
#define M r0
#define N r1
#define A r2
/* Write pointer into the destination buffer. */
#define BO r5
/* Read pointers for the two source lines processed together (LDA bytes apart). */
#define AO1 r6
#define AO2 r7
/* Inner loop counter; reuses r3 once lda has been stored to LDA. */
#define I r3
/* Outer loop counter. */
#define J r12
/* Byte offset used by the pld prefetch hints. */
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// COPY2x2: read two consecutive 16-byte elements from each of AO1 and AO2
// and write them to BO as AO1[0], AO2[0], AO1[1], AO2[1] (d0-d7 in order).
// Advances AO1 and AO2 by 32 bytes and BO by 64 bytes (write-back store).
.macro COPY2x2
pld [ AO1, #A_PRE ]
pld [ AO2, #A_PRE ]
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d4 , [ AO1, #16 ]
fldd d5 , [ AO1, #24 ]
fldd d2 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #32
fldd d6 , [ AO2, #16 ]
fldd d7 , [ AO2, #24 ]
fstmiad BO!, { d0 - d7 }
add AO2, AO2, #32
.endm
// COPY1x2: read one 16-byte element from each of AO1 and AO2 and write
// them to BO as AO1[0], AO2[0]. Advances AO1/AO2 by 16 and BO by 32 bytes.
.macro COPY1x2
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO2, #0 ]
fldd d3 , [ AO2, #8 ]
add AO1, AO1, #16
fstmiad BO!, { d0 - d3 }
add AO2, AO2, #16
.endm
// COPY2x1: copy two consecutive 16-byte elements from AO1 to BO.
// Advances both AO1 and BO by 32 bytes. AO2 is not touched.
.macro COPY2x1
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]
fstmiad BO!, { d0 - d3 }
add AO1, AO1, #32
.endm
// COPY1x1: copy one 16-byte element from AO1 to BO.
// Advances both AO1 and BO by 16 bytes.
.macro COPY1x1
fldd d0 , [ AO1, #0 ]
fldd d1 , [ AO1, #8 ]
fstmiad BO!, { d0 - d1 }
add AO1, AO1, #16
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Packing routine: copies an M x N block of 16-byte (complex double)
 * elements from A (leading dimension lda, in elements) into the buffer B.
 *
 * In:   r0 = M, r1 = N, r2 = A, r3 = lda, [fp, #4] = B
 * Out:  r0 = 0
 *
 * Columns are consumed two at a time; within a pair the elements are
 * interleaved row by row (see COPY2x2 / COPY1x2). A trailing odd column is
 * copied contiguously (COPY2x1 / COPY1x1).
 *
 * Flag handling: asrs, ands and tst update N and Z but leave V untouched,
 * while "ble" tests Z, N and V. The value of V on entry belongs to the
 * caller, so the very first count test uses an explicit cmp (which defines
 * V), and the pure "is this bit set" tests branch with beq.
 */
	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

	lsl	r3, r3, #4			// lda = lda * 8 * 2 (bytes per column)
	str	r3, LDA				// r3 is reused as loop counter I below

	sub	r4, fp, #128
	vstm	r4, { d8 - d15}			// store floating point registers

	ldr	BO, B

/*********************************************************************************************/

zgemm_ncopy_L2_BEGIN:

	asr	J, N, #1			// J = N / 2
	cmp	J, #0				// defines V; flags on entry are unknown
	ble	zgemm_ncopy_L1_BEGIN

zgemm_ncopy_L2_M2_BEGIN:

	mov	AO1, A				// AO1 = A
	ldr	r4 , LDA
	add	AO2, AO1, r4
	add	A , AO2, r4			// A = A + 2 * LDA

	asrs	I, M, #1			// I = M / 2 (V is clear here: set by cmp/subs on J)
	ble	zgemm_ncopy_L2_M2_40

zgemm_ncopy_L2_M2_20:

	COPY2x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_20

zgemm_ncopy_L2_M2_40:

	ands	I, M , #1			// one leftover row?
	beq	zgemm_ncopy_L2_M2_END

zgemm_ncopy_L2_M2_60:

	COPY1x2

	subs	I , I , #1
	bne	zgemm_ncopy_L2_M2_60

zgemm_ncopy_L2_M2_END:

	subs	J , J, #1			// j--
	bne	zgemm_ncopy_L2_M2_BEGIN

/*********************************************************************************************/

zgemm_ncopy_L1_BEGIN:

	tst	N, #1				// one leftover column?
	beq	zgemm_ncopy_L999

zgemm_ncopy_L1_M2_BEGIN:

	mov	AO1, A				// AO1 = A
	ldr	r4 , LDA
	add	A , AO1, r4			// A = A + 1 * LDA

	asrs	I, M, #1			// I = M / 2 (V is clear here, see above)
	ble	zgemm_ncopy_L1_M2_40

zgemm_ncopy_L1_M2_20:

	COPY2x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_20

zgemm_ncopy_L1_M2_40:

	ands	I, M , #1			// one leftover row?
	beq	zgemm_ncopy_L1_M2_END

zgemm_ncopy_L1_M2_60:

	COPY1x1

	subs	I , I , #1
	bne	zgemm_ncopy_L1_M2_60

zgemm_ncopy_L1_M2_END:

zgemm_ncopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}			// restore floating point registers

	movs	r0, #0				// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE

View File

@ -0,0 +1,245 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/07 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256
// Incoming register arguments.
#define OLD_M r0
#define OLD_N r1
#define OLD_A r2
#define OLD_LDA r3
/******************************************************
* [fp, #-128] - [fp, #-64] is reserved
* for store and restore of floating point
* registers
*******************************************************/
// Destination buffer pointer. The prologue sets fp = sp + 24 after pushing
// seven words, so [fp, #4] is the first word above the saved registers.
// The routine also writes this slot to advance the destination per pass.
#define B [fp, #4 ]
// Stack slot holding the current source pointer (spilled from r2).
#define A [fp, #-248 ]
#define M r0
#define N r1
// Byte step applied to BO1 after each 2-wide copy (M * 32). Reuses r2,
// which is free once the source pointer has been stored to the A slot.
#define M4 r2
// lda converted to bytes.
#define LDA r5
// Read cursor into the source.
#define AO1 r6
// Write cursor for the 2-wide copies.
#define BO1 r7
// Write cursor for the odd-N tail copies.
#define BO2 r8
// Inner and outer loop counters.
#define I r4
#define J r12
// Prefetch distance in bytes used by the pld instructions in COPY2x2.
#define A_PRE 256
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// COPY2x2: read two consecutive 16-byte elements at AO1 and two at
// AO1 + LDA, and store all four (64 bytes) at BO1.
// Advances AO1 by 32 bytes and BO1 by M4 bytes. Clobbers r3.
.macro COPY2x2
pld [ AO1, #A_PRE ]
fldmiad AO1, { d0 - d3 }
add r3, AO1, LDA
pld [ r3, #A_PRE ]
fldmiad r3, { d4 - d7 }
fstmiad BO1, { d0 - d7 }
add AO1, AO1, #32
add BO1, BO1, M4
.endm
// COPY1x2: read one 16-byte element at AO1 and one at AO1 + LDA, and store
// both (32 bytes) at BO2.
// Advances AO1 by 16 bytes and BO2 by 32 bytes. Clobbers r3.
.macro COPY1x2
fldmiad AO1, { d0 -d1 }
add r3, AO1, LDA
fldmiad r3, { d2 - d3 }
fstmiad BO2, { d0 - d3 }
add AO1, AO1, #16
add BO2, BO2, #32
.endm
/*************************************************************************************************************************/
// COPY2x1: copy two consecutive 16-byte elements from AO1 to BO1.
// Advances AO1 by 32 bytes and BO1 by M4 bytes.
.macro COPY2x1
fldmiad AO1, { d0 - d3 }
fstmiad BO1, { d0 - d3 }
add AO1, AO1, #32
add BO1, BO1, M4
.endm
// COPY1x1: copy one 16-byte element from AO1 to BO2.
// Advances both AO1 and BO2 by 16 bytes.
.macro COPY1x1
fldmiad AO1, { d0 - d1 }
fstmiad BO2, { d0 - d1 }
add AO1, AO1, #16
add BO2, BO2, #16
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Packing routine (transposed access): copies 16-byte (complex double)
 * elements from A (leading dimension lda, in elements) into the buffer B.
 *
 * In:   r0 = M, r1 = N, r2 = A, r3 = lda, [fp, #4] = B
 * Out:  r0 = 0
 *
 * The outer loop walks A in steps of two lda-strides (M / 2 passes, plus one
 * single-stride pass when M is odd). Within a pass, N / 2 two-element groups
 * are written through BO1, which jumps by M4 = M * 32 bytes per group; an
 * odd trailing element is written through BO2, which starts at
 * B + (N & ~1) * M * 16 bytes.
 *
 * Flag handling: asrs and tst update N and Z but leave V untouched, while
 * "ble" tests Z, N and V. The value of V on entry belongs to the caller, so
 * the very first count test uses an explicit cmp (which defines V), and the
 * pure "is this bit set" tests branch with beq.
 */
	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE		// reserve stack

	str	OLD_A, A			// store A (r2 is reused as M4 below)
	lsl	LDA, OLD_LDA, #4		// lda = lda * SIZE * 2

	sub	r4, fp, #128
	vstm	r4, { d8 - d15}			// store floating point registers

	lsl	r4 , M, #4			// M * SIZE * 2
	ldr	r3, B
	and	BO2 , N , #-2
	mul	BO2, BO2, r4
	add	BO2 , BO2, r3			// BO2 = B + (N & ~1) * M * SIZE * 2
	lsl	M4, M, #5			// M4 = M * 2 * SIZE * 2

zgemm_tcopy_L2_BEGIN:

	asr	J, M, #1			// J = M / 2
	cmp	J, #0				// defines V; flags on entry are unknown
	ble	zgemm_tcopy_L1_BEGIN

zgemm_tcopy_L2_M2_BEGIN:

	ldr	AO1, A				// AO1 = A
	lsl	r3, LDA, #1			// r3 = 2 * LDA
	add	r3, r3 , AO1			// A = A + 2 * LDA
	str	r3, A				// store A

	ldr	BO1, B
	add	r3, BO1, #64			// B = B + 4 * SIZE *2
	str	r3, B

	asrs	I, N, #1			// I = N / 2 (V is clear here: set by cmp/subs on J)
	ble	zgemm_tcopy_L2_M2_60

zgemm_tcopy_L2_M2_40:

	COPY2x2

	subs	I, I, #1
	bne	zgemm_tcopy_L2_M2_40

zgemm_tcopy_L2_M2_60:

	tst	N , #1				// odd trailing element?
	beq	zgemm_tcopy_L2_M2_END

	COPY1x2

zgemm_tcopy_L2_M2_END:

	subs	J , J, #1			// j--
	bne	zgemm_tcopy_L2_M2_BEGIN

/*********************************************************************************************/

zgemm_tcopy_L1_BEGIN:

	tst	M, #1				// one leftover single-stride pass?
	beq	zgemm_tcopy_L999

zgemm_tcopy_L1_M2_BEGIN:

	ldr	AO1, A				// AO1 = A
	add	r3, LDA , AO1			// A = A + 1 * LDA
	str	r3, A				// store A

	ldr	BO1, B
	add	r3, BO1, #32			// B = B + 2 * SIZE *2
	str	r3, B

	asrs	I, N, #1			// I = N / 2 (V is clear here, see above)
	ble	zgemm_tcopy_L1_M2_60

zgemm_tcopy_L1_M2_40:

	COPY2x1

	subs	I, I, #1
	bne	zgemm_tcopy_L1_M2_40

zgemm_tcopy_L1_M2_60:

	tst	N , #1				// odd trailing element?
	beq	zgemm_tcopy_L1_M2_END

	COPY1x1

zgemm_tcopy_L1_M2_END:

zgemm_tcopy_L999:

	sub	r3, fp, #128
	vldm	r3, { d8 - d15}			// restore floating point registers

	mov	r0, #0				// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE

157
kernel/arm/zgemv_n.c Normal file
View File

@ -0,0 +1,157 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* * 2013/11/23 Saar
* * BLASTEST float : OK
* * BLASTEST double : OK
* CTEST : OK
* TEST : OK
* *
* **************************************************************************************/
#include "common.h"
/*
 * The conjugation variants are resolved once here so that the loops in the
 * kernel below stay free of preprocessor branches.
 *
 * GEMVN_SCALE_X(tr, ti, xr, xi)
 *     Combines alpha with the x element (xr, xi) into (tr, ti). The XCONJ
 *     build flips the sign of both alpha_i terms. Reads alpha_r / alpha_i
 *     from the enclosing scope.
 *
 * GEMVN_ACCUM(yr, yi, tr, ti, ar, ai)
 *     Folds the product of (tr, ti) with the matrix element (ar, ai) into
 *     the y element (yr, yi), using the signs required by CONJ / XCONJ.
 */
#if !defined(XCONJ)
#define GEMVN_SCALE_X(tr, ti, xr, xi) \
	do { \
		(tr) = alpha_r * (xr) - alpha_i * (xi); \
		(ti) = alpha_r * (xi) + alpha_i * (xr); \
	} while (0)
#else
#define GEMVN_SCALE_X(tr, ti, xr, xi) \
	do { \
		(tr) = alpha_r * (xr) + alpha_i * (xi); \
		(ti) = alpha_r * (xi) - alpha_i * (xr); \
	} while (0)
#endif

#if !defined(CONJ) && !defined(XCONJ)
#define GEMVN_ACCUM(yr, yi, tr, ti, ar, ai) \
	do { \
		(yr) += (tr) * (ar) - (ti) * (ai); \
		(yi) += (tr) * (ai) + (ti) * (ar); \
	} while (0)
#elif !defined(CONJ) && defined(XCONJ)
#define GEMVN_ACCUM(yr, yi, tr, ti, ar, ai) \
	do { \
		(yr) += (tr) * (ar) + (ti) * (ai); \
		(yi) += (tr) * (ai) - (ti) * (ar); \
	} while (0)
#elif defined(CONJ) && !defined(XCONJ)
#define GEMVN_ACCUM(yr, yi, tr, ti, ar, ai) \
	do { \
		(yr) += (tr) * (ar) + (ti) * (ai); \
		(yi) -= (tr) * (ai) - (ti) * (ar); \
	} while (0)
#else
#define GEMVN_ACCUM(yr, yi, tr, ti, ar, ai) \
	do { \
		(yr) += (tr) * (ar) - (ti) * (ai); \
		(yi) -= (tr) * (ai) + (ti) * (ar); \
	} while (0)
#endif

/*
 * Reference complex GEMV kernel, non-transposed form.
 *
 * For every column j (0 <= j < n) the element x[j] is combined with alpha
 * and the resulting scalar times column j of a is accumulated into y.
 * All arrays hold interleaved (real, imaginary) pairs of FLOAT.
 *
 *   m, n          number of rows / columns processed
 *   dummy1        unused
 *   alpha_r/_i    real and imaginary part of the scalar
 *   a, lda        matrix; consecutive columns are 2 * lda FLOATs apart
 *   x, inc_x      input vector and its stride in complex elements
 *   y, inc_y      vector updated in place and its stride in complex elements
 *   buffer        unused
 *
 * Always returns 0. The unit-stride case is handled by a dedicated loop.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col;
	BLASLONG a_off, x_off, y_off;
	BLASLONG x_step, y_step;
	BLASLONG col_step = 2 * lda;
	FLOAT *column = a;
	FLOAT scaled_r, scaled_i;

	x_off = 0;

	if (inc_x == 1 && inc_y == 1) {
		/* Contiguous x and y: one offset indexes both the column and y. */
		for (col = 0; col < n; col++, column += col_step, x_off += 2) {
			GEMVN_SCALE_X(scaled_r, scaled_i, x[x_off], x[x_off + 1]);

			for (row = 0, a_off = 0; row < m; row++, a_off += 2) {
				GEMVN_ACCUM(y[a_off], y[a_off + 1], scaled_r, scaled_i,
					    column[a_off], column[a_off + 1]);
			}
		}
		return(0);
	}

	/* General strides, expressed in FLOATs. */
	x_step = 2 * inc_x;
	y_step = 2 * inc_y;

	for (col = 0; col < n; col++, column += col_step, x_off += x_step) {
		GEMVN_SCALE_X(scaled_r, scaled_i, x[x_off], x[x_off + 1]);

		for (row = 0, a_off = 0, y_off = 0; row < m; row++, a_off += 2, y_off += y_step) {
			GEMVN_ACCUM(y[y_off], y[y_off + 1], scaled_r, scaled_i,
				    column[a_off], column[a_off + 1]);
		}
	}
	return(0);
}

#undef GEMVN_SCALE_X
#undef GEMVN_ACCUM

699
kernel/arm/zgemv_n_vfp.S Normal file
View File

@ -0,0 +1,699 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256
// Stack-passed arguments. The prologue sets fp = sp + 28 after pushing
// seven words, so [fp, #0] is the first word above the saved registers.
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
// Register arguments as they arrive.
#define OLD_A r3
#define OLD_M r0
// Working registers. AO1 reuses r0, so M is spilled to the stack first.
#define AO1 r0
#define N r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
// Spill slots in the local frame. The prologue stores d0 to ALPHA_R and d1
// to ALPHA_I; the SAVE_* macros reload them from there.
#define ALPHA_I [fp, #-236]
#define ALPHA_R [fp, #-244]
#define M [fp, #-252 ]
#define A [fp, #-256 ]
// Prefetch offsets in bytes for the pld instructions on XO, YO and AO2.
#define X_PRE 64
#define Y_PRE 0
#define A_PRE 0
/**************************************************************************************/
// Sign selection for the four CONJ / XCONJ builds.
// fmacd adds the product to the destination, fnmacd subtracts it.
//
// In the KERNEL_* macros:
//   KMAC_R acc_r, a_im, x_im    imaginary*imaginary term of the real sum
//   KMAC_I acc_i, a_im, x_re    imaginary*real term of the imaginary sum
// In the SAVE_* macros:
//   FMAC_R1 y_re, alpha_r, acc_r      FMAC_R2 y_re, alpha_i, acc_i
//   FMAC_I1 y_im, alpha_r, acc_i      FMAC_I2 y_im, alpha_i, acc_r
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#endif
/*
 * INIT_F4: prefetch the y block and clear the eight accumulators d8-d15.
 *
 * "x - x" is zero only for finite x; if d8 still holds a NaN or an infinity
 * (caller data, or an overflowed sum from the previous block) the result is
 * NaN. Converting the low word of d8 (s16) from a 32-bit integer first
 * always produces a finite value, so the subtraction reliably yields zero.
 * No core register is touched.
 */
.macro INIT_F4
	pld	[ YO, #Y_PRE ]
	vcvt.f64.s32	d8 , s16		// any bit pattern -> finite double
	vsub.f64	d8 , d8 , d8		// finite - finite = 0.0
	vmov.f64	d9 , d8
	vmov.f64	d10, d8
	vmov.f64	d11, d8
	vmov.f64	d12, d8
	vmov.f64	d13, d8
	vmov.f64	d14, d8
	vmov.f64	d15, d8
.endm
// KERNEL_F4X4: four back-to-back KERNEL_F4X1 steps, with a prefetch of x
// before each pair.
.macro KERNEL_F4X4
pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1
pld [ XO, #X_PRE ]
KERNEL_F4X1
KERNEL_F4X1
.endm
// KERNEL_F4X1: one column step for four consecutive rows (unit stride).
// Loads one x element (d4 = real, d5 = imaginary) and four consecutive
// matrix elements at AO1 + 0/16/32/48, and accumulates the four complex
// products into d8/d9, d10/d11, d12/d13 and d14/d15. AO2 is used only as a
// prefetch address. Advances XO by 16 bytes and AO1/AO2 by LDA.
// Loads are interleaved with the arithmetic; d0-d3 are reused for the
// second pair of elements.
.macro KERNEL_F4X1
fldd d0 , [ AO1 ]
fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]
pld [ AO2, #A_PRE ]
fldd d1 , [ AO1, #8 ]
fmacd d8 , d0, d4
fldd d2 , [ AO1, #16 ]
fmacd d9 , d0, d5
fldd d3 , [ AO1, #24 ]
fmacd d10 , d2, d4
fldd d0 , [ AO1, #32 ]
fmacd d11 , d2, d5
KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
KMAC_R d10 , d3, d5
fldd d1 , [ AO1, #40 ]
KMAC_I d11 , d3, d4
fldd d2 , [ AO1, #48 ]
fmacd d12 , d0, d4
fldd d3 , [ AO1, #56 ]
fmacd d13 , d0, d5
pld [ AO2, #A_PRE+32 ]
fmacd d14 , d2, d4
fmacd d15 , d2, d5
KMAC_R d12 , d1, d5
add XO , XO, #16
KMAC_I d13 , d1, d4
add AO1 , AO1, LDA
KMAC_R d14 , d3, d5
add AO2 , AO2, LDA
KMAC_I d15 , d3, d4
.endm
// SAVE_F4: y[0..3] += alpha * (d8..d15), for contiguous y.
// Reloads alpha from its stack slots into d0/d1, then updates y two
// elements at a time; each store post-increments YO by 32 bytes.
.macro SAVE_F4
fldd d0, ALPHA_R
fldd d1, ALPHA_I
fldmiad YO, { d4 - d7 }
FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8
FMAC_R1 d6 , d0 , d10
FMAC_I1 d7 , d0 , d11
FMAC_R2 d6 , d1 , d11
FMAC_I2 d7 , d1 , d10
fstmiad YO!, { d4 - d7 }
fldmiad YO, { d4 - d7 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14
fstmiad YO!, { d4 - d7 }
.endm
/*
 * INIT_F1: clear the accumulator pair d8/d9.
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity in d8 would
 * give NaN. The integer-to-double conversion of the low word of d8 (s16)
 * always yields a finite value, which makes the subtraction a reliable zero.
 */
.macro INIT_F1
	vcvt.f64.s32	d8 , s16		// any bit pattern -> finite double
	vsub.f64	d8 , d8 , d8		// finite - finite = 0.0
	vmov.f64	d9 , d8
.endm
// KERNEL_F1X1: one column step for a single row (unit stride).
// Multiplies the matrix element at AO1 (d0/d1) by the x element at XO
// (d4/d5) and accumulates into d8/d9. Advances XO by 16 bytes, AO1 by LDA.
.macro KERNEL_F1X1
fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]
fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]
fmacd d8 , d0, d4
fmacd d9 , d0, d5
KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
add XO , XO, #16
add AO1 , AO1, LDA
.endm
// SAVE_F1: y[0] += alpha * (d8, d9) for contiguous y; advances YO by 16.
.macro SAVE_F1
fldd d0, ALPHA_R
fldd d1, ALPHA_I
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8
fstmiad YO, { d4 - d5 }
add YO, YO, #16
.endm
/****************************************************************************************/
/*
 * INIT_S4: clear the eight accumulators d8-d15 (strided path).
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity in d8 would
 * give NaN. The integer-to-double conversion of the low word of d8 (s16)
 * always yields a finite value, which makes the subtraction a reliable zero.
 */
.macro INIT_S4
	vcvt.f64.s32	d8 , s16		// any bit pattern -> finite double
	vsub.f64	d8 , d8 , d8		// finite - finite = 0.0
	vmov.f64	d9 , d8
	vmov.f64	d10, d8
	vmov.f64	d11, d8
	vmov.f64	d12, d8
	vmov.f64	d13, d8
	vmov.f64	d14, d8
	vmov.f64	d15, d8
.endm
// KERNEL_S4X4: four back-to-back KERNEL_S4X1 steps.
.macro KERNEL_S4X4
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
KERNEL_S4X1
.endm
// KERNEL_S4X1: one column step for four consecutive rows (strided x).
// Same arithmetic as the unit-stride kernel: the x element (d4/d5) times
// the four matrix elements at AO1 + 0/16/32/48 is accumulated into
// d8/d9, d10/d11, d12/d13 and d14/d15. Advances XO by INC_X and AO1/AO2 by
// LDA (both already in bytes). AO2 is advanced but not dereferenced here.
.macro KERNEL_S4X1
fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]
fldd d2 , [ AO1, #16 ]
fldd d3 , [ AO1, #24 ]
fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]
fmacd d8 , d0, d4
fmacd d9 , d0, d5
fmacd d10 , d2, d4
fmacd d11 , d2, d5
KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
KMAC_R d10 , d3, d5
KMAC_I d11 , d3, d4
fldd d0 , [ AO1, #32 ]
fldd d1 , [ AO1, #40 ]
fldd d2 , [ AO1, #48 ]
fldd d3 , [ AO1, #56 ]
fmacd d12 , d0, d4
fmacd d13 , d0, d5
fmacd d14 , d2, d4
fmacd d15 , d2, d5
KMAC_R d12 , d1, d5
KMAC_I d13 , d1, d4
KMAC_R d14 , d3, d5
KMAC_I d15 , d3, d4
add XO , XO, INC_X
add AO1 , AO1, LDA
add AO2 , AO2, LDA
.endm
// SAVE_S4: y[0..3] += alpha * (d8..d15) for strided y.
// Each of the four y elements is loaded, updated and stored individually,
// with YO stepped by INC_Y (bytes) after every element.
.macro SAVE_S4
fldd d0, ALPHA_R
fldd d1, ALPHA_I
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8
fstmiad YO, { d4 - d5 }
add YO, YO, INC_Y
fldmiad YO, { d6 - d7 }
FMAC_R1 d6 , d0 , d10
FMAC_I1 d7 , d0 , d11
FMAC_R2 d6 , d1 , d11
FMAC_I2 d7 , d1 , d10
fstmiad YO, { d6 - d7 }
add YO, YO, INC_Y
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
fstmiad YO, { d4 - d5 }
add YO, YO, INC_Y
fldmiad YO, { d6 - d7 }
FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14
fstmiad YO, { d6 - d7 }
add YO, YO, INC_Y
.endm
/*
 * INIT_S1: clear the accumulator pair d8/d9 (strided path).
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity in d8 would
 * give NaN. The integer-to-double conversion of the low word of d8 (s16)
 * always yields a finite value, which makes the subtraction a reliable zero.
 */
.macro INIT_S1
	vcvt.f64.s32	d8 , s16		// any bit pattern -> finite double
	vsub.f64	d8 , d8 , d8		// finite - finite = 0.0
	vmov.f64	d9 , d8
.endm
// KERNEL_S1X1: one column step for a single row (strided x).
// Multiplies the matrix element at AO1 (d0/d1) by the x element at XO
// (d4/d5) and accumulates into d8/d9. Advances XO by INC_X, AO1 by LDA.
.macro KERNEL_S1X1
fldd d0 , [ AO1 ]
fldd d1 , [ AO1, #8 ]
fldd d4 , [ XO ]
fldd d5 , [ XO, #8 ]
fmacd d8 , d0, d4
fmacd d9 , d0, d5
KMAC_R d8 , d1, d5
KMAC_I d9 , d1, d4
add XO , XO, INC_X
add AO1 , AO1, LDA
.endm
// SAVE_S1: y[0] += alpha * (d8, d9) for strided y; advances YO by INC_Y.
.macro SAVE_S1
fldd d0, ALPHA_R
fldd d1, ALPHA_I
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d8
FMAC_I1 d5 , d0 , d9
FMAC_R2 d4 , d1 , d9
FMAC_I2 d5 , d1 , d8
fstmiad YO, { d4 - d5 }
add YO, YO, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/*
 * Complex GEMV kernel, non-transposed form: y += alpha * A * x, with the
 * conjugation variant fixed at build time by the KMAC_x / FMAC_x selections.
 *
 * In:  r0 = M, r1 = N, r3 = A, d0 = alpha_r, d1 = alpha_i,
 *      stack: lda, x, inc_x, y, inc_y (see the [fp, #n] defines)
 * Out: r0 = 0
 *
 * Returns immediately (after restoring registers) when M <= 0, N <= 0,
 * inc_x == 0 or inc_y == 0.
 *
 * Rows of y are produced in blocks of four (M / 4 blocks), each block
 * sweeping all N columns, followed by M & 3 single rows. There are two
 * copies of this structure: the F* labels for inc_x == inc_y == 1 and the
 * S* labels for every other stride combination.
 */
PROLOGUE
.align 5
push {r4 - r9 , fp}
// fp = address of the first stack-passed argument (seven words pushed)
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
sub r12, fp, #192
// NOTE(review): the non-DOUBLE branch saves s8-s15 (= d4-d7) while the
// macros in this file clobber d8-d15; the file uses double precision
// instructions throughout, so presumably it is only built with DOUBLE
// defined - confirm.
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
// Nothing to do for empty dimensions.
cmp OLD_M, #0
ble zgemvn_kernel_L999
cmp N, #0
ble zgemvn_kernel_L999
// Spill A, M and alpha: r0 is reused as AO1, r3 as scratch, and d0/d1 are
// overwritten by the kernels.
str OLD_A, A
str OLD_M, M
vstr d0 , ALPHA_R
vstr d1 , ALPHA_I
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
// A zero stride is rejected.
cmp INC_X, #0
beq zgemvn_kernel_L999
cmp INC_Y, #0
beq zgemvn_kernel_L999
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE * 2
#else
lsl LDA, LDA, #3 // LDA * SIZE * 2
#endif
// Unit strides on both vectors take the F path, everything else the S path.
cmp INC_X, #1
bne zgemvn_kernel_S4_BEGIN
cmp INC_Y, #1
bne zgemvn_kernel_S4_BEGIN
zgemvn_kernel_F4_BEGIN:
ldr YO , Y
ldr I, M
asrs I, I, #2 // I = M / 4
ble zgemvn_kernel_F1_BEGIN
zgemvn_kernel_F4X4:
// Start of a four-row block: AO1 = current rows, saved A moves on by four
// elements (64 bytes), AO2 = AO1 + 3 * LDA (prefetch address only).
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #64
str r3 , A
add AO2, AO2, LDA
add AO2, AO2, LDA
ldr XO , X
INIT_F4
asrs J, N, #2 // J = N / 4
ble zgemvn_kernel_F4X1
zgemvn_kernel_F4X4_10:
KERNEL_F4X4
subs J, J, #1
bne zgemvn_kernel_F4X4_10
zgemvn_kernel_F4X1:
// Remaining N & 3 columns, one at a time.
ands J, N , #3
ble zgemvn_kernel_F4_END
zgemvn_kernel_F4X1_10:
KERNEL_F4X1
subs J, J, #1
bne zgemvn_kernel_F4X1_10
zgemvn_kernel_F4_END:
SAVE_F4
subs I , I , #1
bne zgemvn_kernel_F4X4
zgemvn_kernel_F1_BEGIN:
// Remaining M & 3 rows, one row per pass over all N columns.
ldr I, M
ands I, I , #3
ble zgemvn_kernel_L999
zgemvn_kernel_F1X1:
ldr AO1, A
add r3, AO1, #16
str r3, A
ldr XO , X
INIT_F1
mov J, N
zgemvn_kernel_F1X1_10:
KERNEL_F1X1
subs J, J, #1
bne zgemvn_kernel_F1X1_10
zgemvn_kernel_F1_END:
SAVE_F1
subs I , I , #1
bne zgemvn_kernel_F1X1
b zgemvn_kernel_L999
/*************************************************************************************************************/
zgemvn_kernel_S4_BEGIN:
// Strided path: convert both increments from elements to bytes.
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE * 2
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE * 2
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE * 2
#endif
ldr YO , Y
ldr I, M
asrs I, I, #2 // I = M / 4
ble zgemvn_kernel_S1_BEGIN
zgemvn_kernel_S4X4:
// Start of a four-row block; saved A moves on by four elements (64 bytes).
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO1, #64
str r3 , A
ldr XO , X
INIT_S4
asrs J, N, #2 // J = N / 4
ble zgemvn_kernel_S4X1
zgemvn_kernel_S4X4_10:
KERNEL_S4X4
subs J, J, #1
bne zgemvn_kernel_S4X4_10
zgemvn_kernel_S4X1:
// Remaining N & 3 columns, one at a time.
ands J, N , #3
ble zgemvn_kernel_S4_END
zgemvn_kernel_S4X1_10:
KERNEL_S4X1
subs J, J, #1
bne zgemvn_kernel_S4X1_10
zgemvn_kernel_S4_END:
SAVE_S4
subs I , I , #1
bne zgemvn_kernel_S4X4
zgemvn_kernel_S1_BEGIN:
// Remaining M & 3 rows, one row per pass over all N columns.
ldr I, M
ands I, I , #3
ble zgemvn_kernel_L999
zgemvn_kernel_S1X1:
ldr AO1, A
add r3, AO1, #16
str r3, A
ldr XO , X
INIT_S1
mov J, N
zgemvn_kernel_S1X1_10:
KERNEL_S1X1
subs J, J, #1
bne zgemvn_kernel_S1X1_10
zgemvn_kernel_S1_END:
SAVE_S1
subs I , I , #1
bne zgemvn_kernel_S1X1
/*************************************************************************************************************/
zgemvn_kernel_L999:
// Common exit: restore the saved VFP registers and unwind the frame.
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

140
kernel/arm/zgemv_t.c Normal file
View File

@ -0,0 +1,140 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* * 2013/11/23 Saar
* * BLASTEST float : OK
* * BLASTEST double : OK
* CTEST : OK
* TEST : OK
* *
* **************************************************************************************/
#include "common.h"
/*
 * The conjugation variants are resolved once here so that the loops in the
 * kernel below stay free of preprocessor branches.
 *
 * GEMVT_DOT(sr, si, ar, ai, xr, xi)
 *     Adds the product of the matrix element (ar, ai) and the x element
 *     (xr, xi) to the running sum (sr, si). When exactly one of CONJ / XCONJ
 *     is defined, the two terms involving ai change sign.
 *
 * GEMVT_STORE(yr, yi, sr, si)
 *     Folds alpha times the finished sum (sr, si) into the y element
 *     (yr, yi), with the signs required by XCONJ. Reads alpha_r / alpha_i
 *     from the enclosing scope.
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GEMVT_DOT(sr, si, ar, ai, xr, xi) \
	do { \
		(sr) += (ar) * (xr) - (ai) * (xi); \
		(si) += (ar) * (xi) + (ai) * (xr); \
	} while (0)
#else
#define GEMVT_DOT(sr, si, ar, ai, xr, xi) \
	do { \
		(sr) += (ar) * (xr) + (ai) * (xi); \
		(si) += (ar) * (xi) - (ai) * (xr); \
	} while (0)
#endif

#if !defined(XCONJ)
#define GEMVT_STORE(yr, yi, sr, si) \
	do { \
		(yr) += alpha_r * (sr) - alpha_i * (si); \
		(yi) += alpha_r * (si) + alpha_i * (sr); \
	} while (0)
#else
#define GEMVT_STORE(yr, yi, sr, si) \
	do { \
		(yr) += alpha_r * (sr) + alpha_i * (si); \
		(yi) -= alpha_r * (si) - alpha_i * (sr); \
	} while (0)
#endif

/*
 * Reference complex GEMV kernel, transposed form.
 *
 * For every column j (0 <= j < n) the dot product of column j of a with x
 * over m elements is formed, scaled by alpha and accumulated into y[j].
 * All arrays hold interleaved (real, imaginary) pairs of FLOAT.
 *
 *   m, n          length of each dot product / number of columns
 *   dummy1        unused
 *   alpha_r/_i    real and imaginary part of the scalar
 *   a, lda        matrix; consecutive columns are 2 * lda FLOATs apart
 *   x, inc_x      input vector and its stride in complex elements
 *   y, inc_y      vector updated in place and its stride in complex elements
 *   buffer        unused
 *
 * Always returns 0. The unit-stride case is handled by a dedicated loop.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG row, col;
	BLASLONG a_off, x_off, y_off;
	BLASLONG x_step, y_step;
	BLASLONG col_step = 2 * lda;
	FLOAT *column = a;
	FLOAT dot_r, dot_i;

	y_off = 0;

	if (inc_x == 1 && inc_y == 1) {
		/* Contiguous x and y: one offset indexes both the column and x. */
		for (col = 0; col < n; col++, column += col_step, y_off += 2) {
			dot_r = 0.0;
			dot_i = 0.0;

			for (row = 0, a_off = 0; row < m; row++, a_off += 2) {
				GEMVT_DOT(dot_r, dot_i, column[a_off], column[a_off + 1],
					  x[a_off], x[a_off + 1]);
			}

			GEMVT_STORE(y[y_off], y[y_off + 1], dot_r, dot_i);
		}
		return(0);
	}

	/* General strides, expressed in FLOATs. */
	x_step = 2 * inc_x;
	y_step = 2 * inc_y;

	for (col = 0; col < n; col++, column += col_step, y_off += y_step) {
		dot_r = 0.0;
		dot_i = 0.0;

		for (row = 0, a_off = 0, x_off = 0; row < m; row++, a_off += 2, x_off += x_step) {
			GEMVT_DOT(dot_r, dot_i, column[a_off], column[a_off + 1],
				  x[x_off], x[x_off + 1]);
		}

		GEMVT_STORE(y[y_off], y[y_off + 1], dot_r, dot_i);
	}
	return(0);
}

#undef GEMVT_DOT
#undef GEMVT_STORE

608
kernel/arm/zgemv_t_vfp.S Normal file
View File

@ -0,0 +1,608 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/11/29 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
// Bytes of scratch stack reserved below the saved core registers.
#define STACKSIZE 256
// Stack-passed arguments. The prologue sets fp = sp + 28 after pushing
// seven words, so [fp, #0] is the first word above the saved registers.
#define OLD_LDA [fp, #0 ]
#define X [fp, #4 ]
#define OLD_INC_X [fp, #8 ]
#define Y [fp, #12 ]
#define OLD_INC_Y [fp, #16 ]
// Register arguments as they arrive.
#define OLD_A r3
#define OLD_N r1
// Working registers. AO1 reuses r1, so N is spilled to the stack first.
#define M r0
#define AO1 r1
#define J r2
#define AO2 r4
#define XO r5
#define YO r6
#define LDA r7
#define INC_X r8
#define INC_Y r9
#define I r12
// Spill slots in the local frame.
#define N [fp, #-252 ]
#define A [fp, #-256 ]
// Prefetch distances in bytes. None of the macros defined directly below
// issue a pld, so these are not referenced by them.
#define X_PRE 512
#define A_PRE 512
#define Y_PRE 32
/**************************************************************************************
* Macro definitions
**************************************************************************************/
// Sign selection for the four CONJ / XCONJ builds.
// fmacd adds the product to the destination, fnmacd subtracts it.
//
// In the KERNEL_* macros:
//   KMAC_R acc_r, a_im, x_im    imaginary*imaginary term of the real sum
//   KMAC_I acc_i, a_im, x_re    imaginary*real term of the imaginary sum
// In the SAVE_* macros (d0 / d1 are the scalar operands):
//   FMAC_R1 y_re, d0, acc_r      FMAC_R2 y_re, d1, acc_i
//   FMAC_I1 y_im, d0, acc_i      FMAC_I2 y_im, d1, acc_r
#if !defined(CONJ) && !defined(XCONJ)
#define KMAC_R fnmacd
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif defined(CONJ) && !defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fnmacd
#define FMAC_I1 fmacd
#define FMAC_I2 fmacd
#elif !defined(CONJ) && defined(XCONJ)
#define KMAC_R fmacd
#define KMAC_I fnmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#else
#define KMAC_R fnmacd
#define KMAC_I fmacd
#define FMAC_R1 fmacd
#define FMAC_R2 fmacd
#define FMAC_I1 fnmacd
#define FMAC_I2 fmacd
#endif
/*
 * INIT_F2: clear the two accumulator pairs d12/d13 and d14/d15.
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity in a register
 * (caller data, or an overflowed sum from the previous column pair) would
 * give NaN. Converting the low word of d12 (s24) from a 32-bit integer
 * first always produces a finite value, so the subtraction reliably yields
 * zero, which is then copied to the other accumulators.
 */
.macro INIT_F2
	vcvt.f64.s32	d12, s24		// any bit pattern -> finite double
	vsub.f64	d12, d12, d12		// finite - finite = 0.0
	vmov.f64	d13, d12
	vmov.f64	d14, d12
	vmov.f64	d15, d12
.endm
// KERNEL_F2X4: four back-to-back KERNEL_F2X1 steps.
.macro KERNEL_F2X4
KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1
KERNEL_F2X1
.endm
// KERNEL_F2X1: one row step for two columns (unit stride).
// Loads one x element (d2/d3) and one matrix element from each of AO1
// (d4/d5) and AO2 (d8/d9), all with post-increment of 16 bytes, and
// accumulates the two complex products into d12/d13 and d14/d15.
.macro KERNEL_F2X1
fldmiad XO! , { d2 - d3 }
fldmiad AO1!, { d4 - d5 }
fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
fldmiad AO2!, { d8 - d9 }
KMAC_R d12 , d5 , d3
KMAC_I d13 , d5 , d2
fmacd d14 , d8 , d2
fmacd d15 , d8 , d3
KMAC_R d14 , d9 , d3
KMAC_I d15 , d9 , d2
.endm
// SAVE_F2: updates two contiguous y elements from d12/d13 and d14/d15,
// scaled by d0/d1, and post-increments YO by 32 bytes.
// d0/d1 are not loaded here - presumably they still hold alpha_r/alpha_i as
// passed in by the caller; confirm against the routine entry.
.macro SAVE_F2
fldmiad YO, { d4 - d7 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14
fstmiad YO!, { d4 - d7 }
.endm
/************************************************************************************************/
/*
 * INIT_F1: clear the accumulator pair d12/d13.
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity would give
 * NaN. The integer-to-double conversion of the low word of d12 (s24) always
 * yields a finite value, which makes the subtraction a reliable zero.
 */
.macro INIT_F1
	vcvt.f64.s32	d12, s24		// any bit pattern -> finite double
	vsub.f64	d12, d12, d12		// finite - finite = 0.0
	vmov.f64	d13, d12
.endm
// KERNEL_F1X4: four back-to-back KERNEL_F1X1 steps.
.macro KERNEL_F1X4
KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1
KERNEL_F1X1
.endm
// KERNEL_F1X1: one row step for a single column (unit stride).
// Loads one x element (d2/d3) and one matrix element (d4/d5), both with
// post-increment of 16 bytes, and accumulates their product into d12/d13.
.macro KERNEL_F1X1
fldmiad XO! , { d2 - d3 }
fldmiad AO1!, { d4 - d5 }
fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
KMAC_R d12 , d5 , d3
KMAC_I d13 , d5 , d2
.endm
// SAVE_F1: updates one y element from d12/d13, scaled by d0/d1, and
// post-increments YO by 16 bytes. d0/d1 are not loaded here - presumably
// they still hold alpha_r/alpha_i as passed in; confirm.
.macro SAVE_F1
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
fstmiad YO!, { d4 - d5 }
.endm
/************************************************************************************************/
/*
 * INIT_S2: clear the two accumulator pairs d12/d13 and d14/d15 (strided
 * path).
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity would give
 * NaN. The integer-to-double conversion of the low word of d12 (s24) always
 * yields a finite value, which makes the subtraction a reliable zero; the
 * zero is then copied to the other accumulators.
 */
.macro INIT_S2
	vcvt.f64.s32	d12, s24		// any bit pattern -> finite double
	vsub.f64	d12, d12, d12		// finite - finite = 0.0
	vmov.f64	d13, d12
	vmov.f64	d14, d12
	vmov.f64	d15, d12
.endm
// KERNEL_S2X4: four back-to-back KERNEL_S2X1 steps.
.macro KERNEL_S2X4
KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1
KERNEL_S2X1
.endm
// KERNEL_S2X1: one row step for two columns (strided x).
// Loads one x element (d2/d3) without write-back, one matrix element from
// each of AO1 (d4/d5) and AO2 (d8/d9) with post-increment, accumulates the
// two complex products into d12/d13 and d14/d15, then steps XO by INC_X.
.macro KERNEL_S2X1
fldmiad XO , { d2 - d3 }
fldmiad AO1!, { d4 - d5 }
fldmiad AO2!, { d8 - d9 }
fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
KMAC_R d12 , d5 , d3
KMAC_I d13 , d5 , d2
fmacd d14 , d8 , d2
fmacd d15 , d8 , d3
KMAC_R d14 , d9 , d3
KMAC_I d15 , d9 , d2
add XO, XO, INC_X
.endm
// SAVE_S2: updates two y elements from d12/d13 and d14/d15, scaled by
// d0/d1, stepping YO by INC_Y after each element. d0/d1 are not loaded
// here - presumably they still hold alpha_r/alpha_i as passed in; confirm.
.macro SAVE_S2
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
fstmiad YO, { d4 - d5 }
add YO, YO, INC_Y
fldmiad YO, { d6 - d7 }
FMAC_R1 d6 , d0 , d14
FMAC_I1 d7 , d0 , d15
FMAC_R2 d6 , d1 , d15
FMAC_I2 d7 , d1 , d14
fstmiad YO, { d6 - d7 }
add YO, YO, INC_Y
.endm
/************************************************************************************************/
/*
 * INIT_S1: clear the accumulator pair d12/d13 (strided path).
 *
 * "x - x" is zero only for finite x; a stale NaN or infinity would give
 * NaN. The integer-to-double conversion of the low word of d12 (s24) always
 * yields a finite value, which makes the subtraction a reliable zero.
 */
.macro INIT_S1
	vcvt.f64.s32	d12, s24		// any bit pattern -> finite double
	vsub.f64	d12, d12, d12		// finite - finite = 0.0
	vmov.f64	d13, d12
.endm
// KERNEL_S1X4: four back-to-back KERNEL_S1X1 steps.
.macro KERNEL_S1X4
KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1
KERNEL_S1X1
.endm
// KERNEL_S1X1: one row step for a single column (strided x).
// Loads one x element (d2/d3) without write-back and one matrix element
// (d4/d5) with post-increment, accumulates their product into d12/d13,
// then steps XO by INC_X.
.macro KERNEL_S1X1
fldmiad XO , { d2 - d3 }
fldmiad AO1!, { d4 - d5 }
fmacd d12 , d4 , d2
fmacd d13 , d4 , d3
KMAC_R d12 , d5 , d3
KMAC_I d13 , d5 , d2
add XO, XO, INC_X
.endm
// SAVE_S1: updates one y element from d12/d13, scaled by d0/d1, then steps
// YO by INC_Y. d0/d1 are not loaded here - presumably they still hold
// alpha_r/alpha_i as passed in; confirm.
.macro SAVE_S1
fldmiad YO, { d4 - d5 }
FMAC_R1 d4 , d0 , d12
FMAC_I1 d5 , d0 , d13
FMAC_R2 d4 , d1 , d13
FMAC_I2 d5 , d1 , d12
fstmiad YO, { d4 - d5 }
add YO, YO, INC_Y
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
/* Kernel entry point.
 * Overall shape: columns are processed two at a time (J = N / 2), then one
 * leftover column if N is odd. For each column (pair) the accumulators are
 * cleared, M elements are consumed (M / 4 unrolled iterations followed by
 * M & 3 single steps), and the result is folded into the output at YO by the
 * matching SAVE_* macro. There are two copies of this structure: the "F"
 * path when both increments are 1, and the "S" path for any other increments.
 * M, N, A, X, Y, LDA, INC_X, INC_Y, I, J, AO1, AO2, XO, YO and the OLD_*
 * names are register / stack-slot aliases defined outside this view.
 * Always returns 0 in r0.
 */
PROLOGUE
.align 5
/* Save r4-r9 and fp (7 words = 28 bytes) and point fp at the pre-push sp. */
push {r4 - r9 , fp}
add fp, sp, #28
sub sp, sp, #STACKSIZE // reserve stack
/* r12 = fp - 192: save area for the floating point registers.
 * NOTE(review): without DOUBLE only s8-s15 (which alias d4-d7) are saved and
 * restored, while the macros visible in this file write d8, d9 and d12-d15.
 * Verify this file is only ever assembled with DOUBLE defined.
 */
sub r12, fp, #192
#if defined(DOUBLE)
vstm r12, { d8 - d15 } // store floating point registers
#else
vstm r12, { s8 - s15 } // store floating point registers
#endif
/* Nothing to do when M <= 0 or N <= 0. */
cmp M, #0
ble zgemvt_kernel_L999
cmp OLD_N, #0
ble zgemvt_kernel_L999
/* Spill the incoming A and N so their registers can be reused, and fetch
 * the increments from where the caller left them. */
str OLD_A, A
str OLD_N, N
ldr INC_X , OLD_INC_X
ldr INC_Y , OLD_INC_Y
/* A zero increment is rejected: exit without touching the output. */
cmp INC_X, #0
beq zgemvt_kernel_L999
cmp INC_Y, #0
beq zgemvt_kernel_L999
/* Convert LDA from elements to bytes (x16 with DOUBLE, x8 otherwise). */
ldr LDA, OLD_LDA
#if defined(DOUBLE)
lsl LDA, LDA, #4 // LDA * SIZE
#else
lsl LDA, LDA, #3 // LDA * SIZE
#endif
/* Take the strided ("S") path unless both increments are exactly 1. */
cmp INC_X, #1
bne zgemvt_kernel_S2_BEGIN
cmp INC_Y, #1
bne zgemvt_kernel_S2_BEGIN
/* ---- unit-stride path: two columns per outer iteration ---- */
zgemvt_kernel_F2_BEGIN:
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble zgemvt_kernel_F1_BEGIN
zgemvt_kernel_F2X4:
/* AO1/AO2 = current column pair; stored A is advanced by 2 * LDA for the
 * next outer iteration. XO restarts at the beginning of X. */
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
ldr XO , X
INIT_F2
asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_F2X1
zgemvt_kernel_F2X4_10:
KERNEL_F2X4
subs I, I, #1
bne zgemvt_kernel_F2X4_10
zgemvt_kernel_F2X1:
/* Remaining M & 3 elements, one at a time. */
ands I, M , #3
ble zgemvt_kernel_F2_END
zgemvt_kernel_F2X1_10:
KERNEL_F2X1
subs I, I, #1
bne zgemvt_kernel_F2X1_10
zgemvt_kernel_F2_END:
SAVE_F2
subs J , J , #1
bne zgemvt_kernel_F2X4
/* ---- unit-stride path: leftover single column when N is odd ---- */
zgemvt_kernel_F1_BEGIN:
ldr J, N
ands J, J, #1
ble zgemvt_kernel_L999
zgemvt_kernel_F1X4:
ldr AO1, A
ldr XO , X
INIT_F1
asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_F1X1
zgemvt_kernel_F1X4_10:
KERNEL_F1X4
subs I, I, #1
bne zgemvt_kernel_F1X4_10
zgemvt_kernel_F1X1:
/* Remaining M & 3 elements, one at a time. */
ands I, M , #3
ble zgemvt_kernel_F1_END
zgemvt_kernel_F1X1_10:
KERNEL_F1X1
subs I, I, #1
bne zgemvt_kernel_F1X1_10
zgemvt_kernel_F1_END:
SAVE_F1
b zgemvt_kernel_L999
/*************************************************************************************************************/
/* ---- strided path: increments are first converted from elements to bytes ---- */
zgemvt_kernel_S2_BEGIN:
#if defined(DOUBLE)
lsl INC_X, INC_X, #4 // INC_X * SIZE
lsl INC_Y, INC_Y, #4 // INC_Y * SIZE
#else
lsl INC_X, INC_X, #3 // INC_X * SIZE
lsl INC_Y, INC_Y, #3 // INC_Y * SIZE
#endif
ldr YO , Y
ldr J, N
asrs J, J, #1 // J = N / 2
ble zgemvt_kernel_S1_BEGIN
zgemvt_kernel_S2X4:
/* AO1/AO2 = current column pair; stored A is advanced by 2 * LDA for the
 * next outer iteration. XO restarts at the beginning of X. */
ldr AO1, A
add AO2, AO1, LDA
add r3 , AO2, LDA
str r3 , A
ldr XO , X
INIT_S2
asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_S2X1
zgemvt_kernel_S2X4_10:
KERNEL_S2X4
subs I, I, #1
bne zgemvt_kernel_S2X4_10
zgemvt_kernel_S2X1:
/* Remaining M & 3 elements, one at a time. */
ands I, M , #3
ble zgemvt_kernel_S2_END
zgemvt_kernel_S2X1_10:
KERNEL_S2X1
subs I, I, #1
bne zgemvt_kernel_S2X1_10
zgemvt_kernel_S2_END:
SAVE_S2
subs J , J , #1
bne zgemvt_kernel_S2X4
/* ---- strided path: leftover single column when N is odd ---- */
zgemvt_kernel_S1_BEGIN:
ldr J, N
ands J, J, #1
ble zgemvt_kernel_L999
zgemvt_kernel_S1X4:
ldr AO1, A
ldr XO , X
INIT_S1
asrs I, M, #2 // I = M / 4
ble zgemvt_kernel_S1X1
zgemvt_kernel_S1X4_10:
KERNEL_S1X4
subs I, I, #1
bne zgemvt_kernel_S1X4_10
zgemvt_kernel_S1X1:
/* Remaining M & 3 elements, one at a time. */
ands I, M , #3
ble zgemvt_kernel_S1_END
zgemvt_kernel_S1X1_10:
KERNEL_S1X1
subs I, I, #1
bne zgemvt_kernel_S1X1_10
zgemvt_kernel_S1_END:
SAVE_S1
/*************************************************************************************************************/
/* Common exit: restore the saved floating point registers from fp - 192,
 * return 0, unwind the stack frame and pop r4-r9 / fp. */
zgemvt_kernel_L999:
sub r3, fp, #192
#if defined(DOUBLE)
vldm r3, { d8 - d15 } // restore floating point registers
#else
vldm r3, { s8 - s15 } // restore floating point registers
#endif
mov r0, #0 // set return value
sub sp, fp, #28
pop {r4 -r9 ,fp}
bx lr
EPILOGUE

106
kernel/arm/znrm2.c Normal file
View File

@ -0,0 +1,106 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/13 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/*
 * Euclidean norm of a complex vector stored as interleaved (re, im) values.
 *
 * n      number of complex elements
 * x      vector data; element k starts at x[2 * inc_x * k]
 * inc_x  stride between elements, counted in complex elements
 *
 * Returns 0 when n < 0 or inc_x < 1. The sum of squares is kept relative to
 * the largest magnitude seen so far, so individual values are never squared
 * directly; exact zeros are skipped.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT biggest = 0.0;	/* largest |value| encountered so far */
	FLOAT sumsq = 1.0;	/* sum of (value / biggest)^2 */
	FLOAT mag;
	FLOAT result;
	BLASLONG step;
	BLASLONG limit;
	BLASLONG pos;
	BLASLONG part;

	if ( n < 0 || inc_x < 1 ) return(0.0);

	step = 2 * inc_x;
	limit = n * step;

	for ( pos = 0; pos < limit; pos += step )
	{
		/* part 0 is the real component, part 1 the imaginary one */
		for ( part = 0; part < 2; part++ )
		{
			if ( x[pos+part] != 0.0 )
			{
				mag = ABS( x[pos+part] );
				if ( biggest < mag )
				{
					/* new maximum: re-express the running sum relative to it */
					sumsq = 1 + sumsq * ( biggest / mag ) * ( biggest / mag );
					biggest = mag ;
				}
				else
				{
					sumsq += ( mag / biggest ) * ( mag / biggest );
				}
			}
		}
	}

	result = biggest * sqrt( sumsq );
	return(result);
}

68
kernel/arm/zrot.c Normal file
View File

@ -0,0 +1,68 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Apply a plane rotation with real coefficients c and s to two complex
 * vectors stored as interleaved (re, im) values:
 *
 *     x[k] <- c * x[k] + s * y[k]
 *     y[k] <- c * y[k] - s * x[k]      (using the old x[k])
 *
 * n             number of complex elements; nothing is done when n <= 0
 * x, inc_x      first vector and its stride in complex elements
 * y, inc_y      second vector and its stride in complex elements
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
	BLASLONG k;
	BLASLONG xpos = 0;
	BLASLONG ypos = 0;
	FLOAT new_re;
	FLOAT new_im;

	if ( n <= 0 ) return(0);

	BLASLONG xstep = 2 * inc_x ;
	BLASLONG ystep = 2 * inc_y ;

	for ( k = 0; k < n; k++ )
	{
		/* new x is held back until y has been updated from the old x */
		new_re = c*x[xpos] + s*y[ypos] ;
		new_im = c*x[xpos+1] + s*y[ypos+1] ;

		y[ypos] = c*y[ypos] - s*x[xpos] ;
		y[ypos+1] = c*y[ypos+1] - s*x[xpos+1] ;

		x[xpos] = new_re ;
		x[xpos+1] = new_im ;

		xpos += xstep ;
		ypos += ystep ;
	}

	return(0);
}

64
kernel/arm/zscal.c Normal file
View File

@ -0,0 +1,64 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2013/09/14 Saar
* BLASTEST float : OK
* BLASTEST double : OK
* CTEST : OK
* TEST : OK
*
**************************************************************************************/
#include "common.h"
/*
 * Scale a complex vector in place by the complex scalar (da_r + i * da_i).
 * The vector is stored as interleaved (re, im) values.
 *
 * n       number of complex elements
 * da_r    real part of the scalar
 * da_i    imaginary part of the scalar
 * x       vector data, overwritten with the scaled values
 * inc_x   stride between elements, counted in complex elements
 *
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used by this routine.
 * Nothing is done when n < 0 or inc_x < 1. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG remaining;
	BLASLONG step;
	BLASLONG pos = 0;
	FLOAT scaled_re;

	if ( n < 0 || inc_x < 1 ) return(0);

	step = 2 * inc_x;

	remaining = n;
	while ( remaining > 0 )
	{
		/* the real part is held back so the imaginary part sees the old value */
		scaled_re = da_r * x[pos] - da_i * x[pos+1] ;
		x[pos+1] = da_r * x[pos+1] + da_i * x[pos] ;
		x[pos] = scaled_re;

		pos += step;
		remaining--;
	}

	return(0);
}

Some files were not shown because too many files have changed in this diff Show More