diff --git a/Makefile b/Makefile index 39e3bbd65..a84b54d4b 100644 --- a/Makefile +++ b/Makefile @@ -314,7 +314,7 @@ clean :: #endif @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h - @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d $(NETLIB_LAPACK_DIR); then \ echo deleting $(NETLIB_LAPACK_DIR); \ rm -rf $(NETLIB_LAPACK_DIR) ;\ diff --git a/Makefile.getarch b/Makefile.prebuild similarity index 80% rename from Makefile.getarch rename to Makefile.prebuild index dadfb5b1b..c7d0de70e 100644 --- a/Makefile.getarch +++ b/Makefile.prebuild @@ -1,3 +1,5 @@ +# This is triggered by Makefile.system and runs before any of the code is built. + export BINARY export USE_OPENMP @@ -15,7 +17,7 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif -all: getarch_2nd +all: getarch_2nd cblas_noconst.h ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) @@ -36,4 +38,7 @@ else $(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif +cblas_noconst.h : cblas.h + perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h + dummy: diff --git a/Makefile.system b/Makefile.system index 75c0e0ad4..eac61e961 100644 --- a/Makefile.system +++ b/Makefile.system @@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf diff --git a/cblas.h b/cblas.h index ee8bf08b2..6684262e2 100644 --- a/cblas.h +++ b/cblas.h @@ -1,291 +1,293 @@ #ifndef CBLAS_H #define CBLAS_H +#include +#include "common.h" + #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ -#include -#include "common.h" - /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + #define CBLAS_INDEX size_t -enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; -enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; -enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; -enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; -enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); +float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); +openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); +float cblas_sasum (const blasint n, const float *x, const blasint incx); +double cblas_dasum (const blasint n, const double *x, const blasint incx); +float cblas_scasum(const blasint n, const float *x, const blasint incx); +double cblas_dzasum(const blasint n, const double *x, const blasint incx); -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); +float cblas_snrm2 (const blasint N, const float *X, const blasint incX); +double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); +float cblas_scnrm2(const blasint N, const float *X, const blasint incX); +double cblas_dznrm2(const blasint N, const double *X, const blasint incX); -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); +CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); -void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); +void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); +void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); +void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); +void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); +void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); +void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); +void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); +void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); +void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); +void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX); -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); +void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); +void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); +void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); +void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); +void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); +void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, + const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, + const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, + const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, + const double *Y, const blasint incY, double *A, const blasint lda); -void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, + const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, + const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); +void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); +void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); +void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); +void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); +void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); +void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); +void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); +void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); void cblas_xerbla(blasint p, char *rout, char *form, ...); #ifdef __cplusplus } - #endif /* __cplusplus */ #endif diff --git a/common.h b/common.h index 003fde77f..d46a5230a 100644 --- a/common.h +++ b/common.h @@ -390,7 +390,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; @@ -557,7 +558,8 @@ typedef struct { #include "common_level3.h" #include "common_lapack.h" #ifdef CBLAS -#include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER diff --git a/cpuid_x86.c b/cpuid_x86.c index 385114619..317774691 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -118,8 +118,9 @@ static inline int have_excpuid(void){ #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif @@ -1035,6 +1036,8 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 14: + // Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1407,6 +1410,8 @@ int get_coretype(void){ return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX + case 14: + //Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; @@ -1508,6 +1513,9 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); + } else { + //fall back for some virtual machines. + printf("#define DTB_DEFAULT_ENTRIES 32\n"); } features = get_cputype(GET_FEATURE); diff --git a/driver/others/Makefile b/driver/others/Makefile index a1c7a504e..c449ec6c6 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_config.$(SUFFIX) : openblas_get_config.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1c0e1d3bb..6523abb4d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -80,8 +80,9 @@ extern gotoblas_t gotoblas_BULLDOZER; #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif @@ -165,7 +166,8 @@ static gotoblas_t *get_coretype(void){ //Intel Xeon Processor 5600 (Westmere-EP) //Xeon Processor E7 (Westmere-EX) - if (model == 12 || model == 15) return &gotoblas_NEHALEM; + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; //Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 @@ -285,6 +287,15 @@ void gotoblas_dynamic_init(void) { if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ + if (sizeof(void*) == 8) { + if (gotoblas == &gotoblas_KATMAI || + gotoblas == &gotoblas_COPPERMINE || + gotoblas == &gotoblas_NORTHWOOD || + gotoblas == &gotoblas_BANIAS || + gotoblas == &gotoblas_ATHLON) + gotoblas = &gotoblas_PRESCOTT; + } #endif if (gotoblas && gotoblas -> init) { diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c new file mode 100644 index 000000000..581ab1a43 --- /dev/null +++ b/driver/others/openblas_get_config.c @@ -0,0 +1,59 @@ +/***************************************************************************** +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +static char* openblas_config_str="" +#ifdef USE64BITINT + "USE64BITINT " +#endif +#ifdef NO_CBLAS + "NO_CBLAS " +#endif +#ifdef NO_LAPACK + "NO_LAPACK " +#endif +#ifdef NO_LAPACKE + "NO_LAPACKE " +#endif +#ifdef DYNAMIC_ARCH + "DYNAMIC_ARCH " +#endif +#ifdef NO_AFFINITY + "NO_AFFINITY " +#endif + ; + +char* CNAME() { + return openblas_config_str; +} + diff --git a/exports/gensymbol b/exports/gensymbol index c492eefb5..04cbd7d84 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -74,6 +74,7 @@ @misc_no_underscore_objs = ( openblas_set_num_threads, goto_set_num_threads, + openblas_get_config, ); @misc_underscore_objs = ( diff --git a/getarch_2nd.c b/getarch_2nd.c index 5339af442..4bdd16a99 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,7 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif - printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); } return 0; diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e8db76871..83f2b047f 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -634,10 +634,10 @@ static void init_parameter(void) { TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; #endif -#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) #ifdef DEBUG - fprintf(stderr, "Katmai, Coppermine, Banias\n"); + fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif TABLE_NAME.sgemm_p = 64 * (l2 >> 7); diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index 0891657fa..3ff9203c8 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -89,17 +89,22 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) +#define LDAX 12+ARGS(%esp) #define I %eax #define J %ebx @@ -114,6 +119,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -121,7 +127,34 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $21,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y movl STACK_LDA, LDA + movl STACK_X, X movl STACK_INCX, INCX @@ -651,12 +684,22 @@ addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) ALIGN_3 - .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 5f5fa5a51..980797d91 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -76,17 +76,22 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -101,6 +106,8 @@ PROLOGUE + + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -108,6 +115,33 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -677,10 +711,22 @@ ALIGN_3 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: + popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 5bacb7da8..326584bbc 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -89,17 +89,24 @@ #endif #define STACKSIZE 16 +#define ARGS 20 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define NN 4+STACKSIZE(%esp) +#define AA 8+STACKSIZE(%esp) +#define LDAX 12+STACKSIZE(%esp) +#define XX 16+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -114,6 +121,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -122,7 +130,42 @@ PROFCODE movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA movl STACK_X, X + movl X,XX + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) + subl $8, J # Don't use last 8 float in the buffer. + # Now, split M by block J + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA + movl XX,X + movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -198,6 +241,20 @@ jg .L06 ALIGN_4 +//Padding zero to prevent loading the dirty number from buffer. + movl M, I + movl $8, J + andl $7, I + xorps %xmm0, %xmm0 + subl I, J + ALIGN_2 +.L07: + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl J + jg .L07 + ALIGN_4 + .L10: movl Y, Y1 @@ -628,10 +685,22 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl XX,J + addl %eax,J + movl J,XX + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index c7e685dd8..60d6ef270 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -76,18 +76,24 @@ #endif #define STACKSIZE 16 +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define AA 4+STACKSIZE(%esp) +#define LDAX 8+STACKSIZE(%esp) +#define NN 12+STACKSIZE(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) - #define I %eax #define J %ebx @@ -101,6 +107,8 @@ PROLOGUE + subl $ARGS,%esp + pushl %ebp pushl %edi pushl %esi @@ -108,7 +116,40 @@ PROFCODE + movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) + subl $4, J # Don't use last 4 double in the buffer. + # Now, split M by block J + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -117,6 +158,7 @@ leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA + subl $-16 * SIZE, A cmpl $0, N @@ -560,10 +602,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index 61c481064..985ce9fec 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -530,7 +530,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0x39, %xmm4, %xmm5 + pshufd $0x29, %xmm4, %xmm5 mulps %xmm8, %xmm5 addps %xmm5, %xmm3 @@ -750,7 +750,8 @@ xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 - mulps -32 * SIZE(Y), %xmm5 + movlps -32 * SIZE(Y), %xmm4 + mulps %xmm4, %xmm5 addps %xmm5, %xmm0 addq $2 * SIZE, X @@ -992,7 +993,7 @@ movsd -32 * SIZE(Y), %xmm8 movss %xmm5, %xmm4 - shufps $0x93, %xmm5, %xmm4 + shufps $0x93, %xmm4, %xmm4 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 052ff1a79..854e0f295 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,4 +1,3 @@ -/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -47,7 +46,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,6 +56,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else @@ -71,6 +74,10 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define MMM 216(%rsp) +#define NN 224(%rsp) +#define AA 232(%rsp) +#define LDAX 240(%rsp) #endif @@ -127,29 +134,48 @@ movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, X + movq X, AA + movq OLD_LDA, X + movq X, LDAX movq OLD_X, X #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX #endif - - movq STACK_INCX, INCX - movq STACK_Y, Y - movq STACK_INCY, INCY - movq STACK_BUFFER, BUFFER - #ifndef WINDOWS_ABI pshufd $0, %xmm0, ALPHA #else pshufd $0, %xmm3, ALPHA #endif + +.L0t: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00t + ALIGN_4 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00t: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA @@ -6341,6 +6367,12 @@ ALIGN_4 .L999: + leaq (,M,SIZE),%rax + addq %rax,AA + jmp .L0t + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index 13804e0f8..e2f153ab3 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -699,7 +699,7 @@ movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1336,7 +1336,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1697,7 +1697,7 @@ movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2024,7 +2024,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 diff --git a/make.inc b/make.inc index 30004233f..01b9bde92 100644 --- a/make.inc +++ b/make.inc @@ -4,7 +4,7 @@ DRVOPTS = $(OPTS) LOADER = $(FORTRAN) TIMER = NONE ARCHFLAGS= -ru -RANLIB = ranlib +#RANLIB = ranlib BLASLIB = TMGLIB = tmglib.a EIGSRCLIB = eigsrc.a diff --git a/openblas_config_template.h b/openblas_config_template.h index a2b05696f..cf2c037cc 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -48,7 +48,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 #include typedef float _Complex openblas_complex_float;