diff --git a/.gitignore b/.gitignore index 3e163abef..2c298e3b4 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ lapack-netlib/make.inc lapack-netlib/lapacke/include/lapacke_mangling.h lapack-netlib/TESTING/testing_results.txt *.so +*.so.* *.a .svn *~ diff --git a/.travis.yml b/.travis.yml index 7d625c9dc..806cb0046 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,13 @@ language: c + +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/8a6e4470a0cebd090344 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always + compiler: - gcc diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index b88e3671b..88e461dc4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -127,5 +127,8 @@ In chronological order: * Ton van den Heuvel * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). +* Martin Koehler + * [2015-09-07] Improved imatcopy + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/Makefile.arm b/Makefile.arm index 2f7b33730..272220ca9 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -26,8 +26,8 @@ endif ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 -FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 +CCOMMON_OPT += -marm -march=armv5 +FCOMMON_OPT += -marm -march=armv5 endif diff --git a/Makefile.install b/Makefile.install index a5814e55a..9814302b0 100644 --- a/Makefile.install +++ b/Makefile.install @@ -11,6 +11,7 @@ OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake +OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake .PHONY : install .NOTPARALLEL : install @@ -97,6 +98,7 @@ endif @echo Generating $(OPENBLAS_CMAKE_CONFIG) in 
$(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) @@ -112,5 +114,16 @@ else #only static @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) endif +#Generating OpenBLASConfigVersion.cmake + @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) + @echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) + @echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION) @echo Install OK! diff --git a/Makefile.rule b/Makefile.rule index 19f3fe3d9..22f222e3f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -169,6 +169,9 @@ COMMON_PROF = -pg # 64 bit integer interfaces in OpenBLAS. # For details, https://github.com/xianyi/OpenBLAS/pull/459 # +# The same prefix and suffix are also added to the library name, +# i.e. 
you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas +# # SYMBOLPREFIX= # SYMBOLSUFFIX= diff --git a/Makefile.system b/Makefile.system index 325ee6af9..42ad49849 100644 --- a/Makefile.system +++ b/Makefile.system @@ -880,12 +880,6 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifndef LIBNAMESUFFIX -LIBPREFIX = libopenblas -else -LIBPREFIX = libopenblas_$(LIBNAMESUFFIX) -endif - ifndef SYMBOLPREFIX SYMBOLPREFIX = endif @@ -894,6 +888,12 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBNAMESUFFIX +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +else +LIBPREFIX = lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +endif + KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) diff --git a/README.md b/README.md index cdacf9888..16f874078 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # OpenBLAS +[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) ## Introduction diff --git a/TargetList.txt b/TargetList.txt index 0a9d8b40c..b2878ba32 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -44,6 +44,8 @@ NANO POWER4 POWER5 POWER6 +POWER7 +POWER8 PPCG4 PPC970 PPC970MP diff --git a/c_check b/c_check index 0fdadb659..d694e7411 100644 --- a/c_check +++ b/c_check @@ -30,7 +30,7 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { $cross_suffix = $1; } } else { - if ($ARGV[0] =~ /(.*-)(.*)/) { + if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) { $cross_suffix = $1; } } diff --git a/cblas_noconst.h b/cblas_noconst.h deleted file mode 100644 index 4451c304e..000000000 --- a/cblas_noconst.h +++ /dev/null @@ -1,350 +0,0 @@ -#ifndef CBLAS_H -#define CBLAS_H - -#include -#include "common.h" - -#ifdef __cplusplus -extern "C" { - /* Assume C 
declarations for C++ */ -#endif /* __cplusplus */ - -/*Set the number of threads on runtime.*/ -void openblas_set_num_threads(int num_threads); -void goto_set_num_threads(int num_threads); - -/*Get the number of threads on runtime.*/ -int openblas_get_num_threads(void); - -/*Get the number of physical processors (cores).*/ -int openblas_get_num_procs(void); - -/*Get the build configure on runtime.*/ -char* openblas_get_config(void); - -/* Get the parallelization type which is used by OpenBLAS */ -int openblas_get_parallel(void); -/* OpenBLAS is compiled for sequential use */ -#define OPENBLAS_SEQUENTIAL 0 -/* OpenBLAS is compiled using normal threading model */ -#define OPENBLAS_THREAD 1 -/* OpenBLAS is compiled using OpenMP threading model */ -#define OPENBLAS_OPENMP 2 - - -#define CBLAS_INDEX size_t - -typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; -typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; -typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; -typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; -typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; - -float cblas_sdsdot(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); - -openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_cdotu_sub(blasint n, float *x, 
blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); - -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); - -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); - -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); - -void cblas_saxpy(blasint n, float alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double alpha, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *alpha, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *alpha, double *x, blasint incx, double *y, blasint incy); - -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, 
float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); - -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); - -void cblas_srotg(float *a, float *b, float *c, float *s); -void cblas_drotg(double *a, double *b, double *c, double *s); - -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); - -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); - -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); - -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); - -void cblas_sger 
(enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); - -void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum 
CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); - -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); - -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); - -void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum 
CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); - - -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, 
enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); - -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); - -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER 
order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - - -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); - -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); - -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); - -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); - -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum 
CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); - -void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_cgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); -void cblas_zgemm3m(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, 
- float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); - -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum 
CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum 
CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); - -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); - -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); - -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); - -void cblas_xerbla(blasint p, char *rout, char *form, ...); - -/*** BLAS extensions ***/ - -void cblas_saxpby(blasint n, float alpha, float *x, blasint incx,float beta, float *y, blasint incy); - -void cblas_daxpby(blasint n, double alpha, double *x, blasint incx,double beta, double *y, blasint incy); - -void cblas_caxpby(blasint n, float *alpha, float *x, blasint incx,float *beta, float *y, blasint incy); - -void cblas_zaxpby(blasint n, double *alpha, double *x, blasint incx,double *beta, double *y, blasint incy); - -void cblas_somatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, 
blasint crows, blasint ccols, float calpha, float *a, - blasint clda, float *b, blasint cldb); -void cblas_domatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, double *b, blasint cldb); -void cblas_comatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); -void cblas_zomatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, void* calpha, void* a, - blasint clda, void *b, blasint cldb); - -void cblas_simatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float calpha, float *a, - blasint clda, blasint cldb); -void cblas_dimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double calpha, double *a, - blasint clda, blasint cldb); -void cblas_cimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, float* calpha, float* a, - blasint clda, blasint cldb); -void cblas_zimatcopy( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, double* calpha, double* a, - blasint clda, blasint cldb); - -void cblas_sgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float calpha, float *a, blasint clda, float cbeta, - float *c, blasint cldc); -void cblas_dgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double calpha, double *a, blasint clda, double cbeta, - double *c, blasint cldc); -void cblas_cgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, float *calpha, float *a, blasint clda, float *cbeta, - float *c, blasint cldc); -void cblas_zgeadd( enum CBLAS_ORDER CORDER, blasint crows, blasint ccols, double *calpha, double *a, blasint clda, double *cbeta, - double *c, blasint cldc); - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - -#endif diff --git a/common.h b/common.h index 
c87ed6652..0b0bdb812 100644 --- a/common.h +++ b/common.h @@ -117,6 +117,7 @@ extern "C" { #include #endif #include +#include #include #include #ifdef SMP @@ -410,7 +411,51 @@ typedef char env_var_t[MAX_PATH]; typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) #endif + +#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) +#ifdef _POSIX_MONOTONIC_CLOCK +#if defined(__GLIBC_PREREQ) // split the #if condition into two lines, otherwise it will fail at __GLIBC_PREREQ(2, 17) +#if __GLIBC_PREREQ(2, 17) // don't require -lrt +#define USE_MONOTONIC #endif +#elif defined(OS_ANDROID) +#define USE_MONOTONIC +#endif +#endif +/* Fallback rpcc(): returns the current time in nanoseconds, read with clock_gettime(CLOCK_MONOTONIC) when USE_MONOTONIC is defined and with gettimeofday() otherwise. Uses a similar scale as x86 rdtsc for timeouts to work correctly. */ +static inline unsigned long long rpcc(void){ +#ifdef USE_MONOTONIC + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; +#else + struct timeval tv; + gettimeofday(&tv,NULL); + return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#endif +} +#define RPCC_DEFINED +#define RPCC64BIT +#endif // !RPCC_DEFINED + +#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__) +static void __inline blas_lock(volatile BLASULONG *address){ /* generic GCC fallback: spin while *address is non-zero, then take the lock with an atomic 0 -> 1 compare-and-swap */ + + do { + while (*address) {YIELDING;}; + + } while (!__sync_bool_compare_and_swap(address, 0, 1)); +} +#define BLAS_LOCK_DEFINED +#endif + +#ifndef RPCC_DEFINED +#error "rpcc() implementation is missing for your platform" +#endif +#ifndef BLAS_LOCK_DEFINED +#error "blas_lock() implementation is missing for your platform" +#endif +#endif // !ASSEMBLER #ifdef OS_LINUX #include "common_linux.h" diff --git a/common_alpha.h b/common_alpha.h index 845fb316a..9739c941d 100644 --- a/common_alpha.h +++ b/common_alpha.h @@ -76,6 +76,7 @@ static void __inline blas_lock(unsigned long *address){ "30:", address); #endif } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void){ @@ -89,6 +90,7 @@ static __inline unsigned int rpcc(void){ return r0; } +#define RPCC_DEFINED
#define HALT ldq $0, 0($0) diff --git a/common_arm.h b/common_arm.h index 135191057..6bf836835 100644 --- a/common_arm.h +++ b/common_arm.h @@ -51,6 +51,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8) + static void __inline blas_lock(volatile BLASULONG *address){ int register ret; @@ -59,40 +61,29 @@ static void __inline blas_lock(volatile BLASULONG *address){ while (*address) {YIELDING;}; __asm__ __volatile__( - "1: \n\t" - "ldrex r2, [%1] \n\t" - "mov r2, #0 \n\t" - "strex r3, r2, [%1] \n\t" - "cmp r3, #0 \n\t" - "bne 1b \n\t" - "mov %0 , r3 \n\t" - : "=r"(ret), "=r"(address) - : "1"(address) - : "memory", "r2" , "r3" - - + "ldrex r2, [%1] \n\t" + "strex %0, %2, [%1] \n\t" + "orr %0, r2 \n\t" + : "=&r"(ret) + : "r"(address), "r"(1) + : "memory", "r2" ); } while (ret); - + MB; } - -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} +#define BLAS_LOCK_DEFINED +#endif static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } -#if defined(DOUBLE) +#if !defined(HAVE_VFP) +/* no FPU, soft float */ +#define GET_IMAGE(res) +#elif defined(DOUBLE) #define GET_IMAGE(res) __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory") #else #define GET_IMAGE(res) __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory") @@ -140,4 +131,8 @@ REALNAME: #define MAP_ANONYMOUS MAP_ANON #endif +#if !defined(ARMV5) && !defined(ARMV6) && !defined(ARMV7) && !defined(ARMV8) +#error "you must define ARMV5, ARMV6, ARMV7 or ARMV8" +#endif + #endif diff --git a/common_arm64.h b/common_arm64.h index aa310c5f2..15987c677 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -45,42 +45,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void __inline blas_lock(volatile BLASULONG *address){ - int register ret; - int register tmp; + long register ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( - "1: \n\t" - "ldaxr %2, [%1] \n\t" - "mov %2, #0 \n\t" - "stlxr %w0, %2, [%1] \n\t" - "cbnz %w0, 1b \n\t" - "mov %0 , #0 \n\t" - : "=r"(ret), "=r"(address), "=r"(tmp) - : "1"(address) - : "memory", "%w0" - //, "%r2" , "%r3" - - + "ldaxr %0, [%1] \n\t" + "stlxr w2, %2, [%1] \n\t" + "orr %0, %0, x2 \n\t" + : "=r"(ret) + : "r"(address), "r"(1l) + : "memory", "x2" ); } while (ret); - + MB; } +#define BLAS_LOCK_DEFINED -static inline unsigned long long rpcc(void){ - unsigned long long ret=0; - double v; - struct timeval tv; - gettimeofday(&tv,NULL); - v=(double) tv.tv_sec + (double) tv.tv_usec * 1e-6; - ret = (unsigned long long) ( v * 1000.0d ); - return ret; -} - static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } diff --git a/common_c.h b/common_c.h index 741d7d087..ce0f2a5bd 100644 --- a/common_c.h +++ b/common_c.h @@ -220,6 +220,15 @@ #define COMATCOPY_K_CTC comatcopy_k_ctc #define COMATCOPY_K_RTC comatcopy_k_rtc +#define CIMATCOPY_K_CN cimatcopy_k_cn +#define CIMATCOPY_K_RN cimatcopy_k_rn +#define CIMATCOPY_K_CT cimatcopy_k_ct +#define CIMATCOPY_K_RT cimatcopy_k_rt +#define CIMATCOPY_K_CNC cimatcopy_k_cnc +#define CIMATCOPY_K_RNC cimatcopy_k_rnc +#define CIMATCOPY_K_CTC cimatcopy_k_ctc +#define CIMATCOPY_K_RTC cimatcopy_k_rtc + #define CGEADD_K cgeadd_k #else @@ -403,6 +412,16 @@ #define COMATCOPY_K_RNC gotoblas -> comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc + +#define CIMATCOPY_K_CN gotoblas -> cimatcopy_k_cn +#define CIMATCOPY_K_RN gotoblas -> cimatcopy_k_rn +#define CIMATCOPY_K_CT gotoblas -> cimatcopy_k_ct +#define CIMATCOPY_K_RT gotoblas -> cimatcopy_k_rt +#define CIMATCOPY_K_CNC gotoblas -> cimatcopy_k_cnc +#define CIMATCOPY_K_RNC gotoblas -> cimatcopy_k_rnc +#define CIMATCOPY_K_CTC 
gotoblas -> cimatcopy_k_ctc +#define CIMATCOPY_K_RTC gotoblas -> cimatcopy_k_rtc + #define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index d6dfd7f04..ad9945186 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,11 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt + +#define DIMATCOPY_K_CN dimatcopy_k_cn +#define DIMATCOPY_K_RN dimatcopy_k_rn +#define DIMATCOPY_K_CT dimatcopy_k_ct +#define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k #else @@ -267,6 +272,10 @@ #define DOMATCOPY_K_RN gotoblas -> domatcopy_k_rn #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DIMATCOPY_K_CN gotoblas -> dimatcopy_k_cn +#define DIMATCOPY_K_RN gotoblas -> dimatcopy_k_rn +#define DIMATCOPY_K_CT gotoblas -> dimatcopy_k_ct +#define DIMATCOPY_K_RT gotoblas -> dimatcopy_k_rt #define DGEADD_K gotoblas -> dgeadd_k diff --git a/common_ia64.h b/common_ia64.h index 8e92b5992..72b75fc4e 100644 --- a/common_ia64.h +++ b/common_ia64.h @@ -68,6 +68,7 @@ static __inline void blas_lock(volatile unsigned long *address){ : "ar.ccv", "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void) { unsigned long clocks; @@ -75,6 +76,7 @@ static __inline unsigned long rpcc(void) { __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); return clocks; } +#define RPCC_DEFINED static __inline unsigned long stmxcsr(void){ @@ -99,10 +101,12 @@ static __inline void blas_lock(volatile unsigned long *address){ while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) ; } +#define BLAS_LOCK_DEFINED static __inline unsigned int rpcc(void) { return __getReg(_IA64_REG_AR_ITC); } +#define RPCC_DEFINED static __inline unsigned int stmxcsr(void) { return __getReg(_IA64_REG_AR_FPSR); diff --git a/common_level3.h b/common_level3.h index e0ecbc4e2..1f5490baa 100644 --- a/common_level3.h +++ 
b/common_level3.h @@ -1736,31 +1736,55 @@ int somatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLAS int somatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); int somatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int simatcopy_k_cn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rn(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_ct(BLASLONG, BLASLONG, float, float *, BLASLONG); +int simatcopy_k_rt(BLASLONG, BLASLONG, float, float *, BLASLONG); int domatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); int domatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dimatcopy_k_cn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rn(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_ct(BLASLONG, BLASLONG, double, double *, BLASLONG); +int dimatcopy_k_rt(BLASLONG, BLASLONG, double, double *, BLASLONG); int comatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rn(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ct(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rt(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int comatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, 
BLASLONG, float *, BLASLONG); int comatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); int comatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cimatcopy_k_cnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rnc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_ctc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); +int cimatcopy_k_rtc(BLASLONG, BLASLONG, float, float, float *, BLASLONG); int zomatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rn(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ct(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rt(BLASLONG, BLASLONG, double, double, double *, BLASLONG); int zomatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zimatcopy_k_cnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG); +int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, 
BLASLONG); int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8555baa67..4976e766f 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,6 +634,11 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT +#define IMATCOPY_K_CN DIMATCOPY_K_CN +#define IMATCOPY_K_RN DIMATCOPY_K_RN +#define IMATCOPY_K_CT DIMATCOPY_K_CT +#define IMATCOPY_K_RT DIMATCOPY_K_RT + #define GEADD_K DGEADD_K #else @@ -931,6 +936,10 @@ #define OMATCOPY_K_RN SOMATCOPY_K_RN #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define IMATCOPY_K_CN SIMATCOPY_K_CN +#define IMATCOPY_K_RN SIMATCOPY_K_RN +#define IMATCOPY_K_CT SIMATCOPY_K_CT +#define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K #endif @@ -1747,6 +1756,15 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define IMATCOPY_K_CN ZIMATCOPY_K_CN +#define IMATCOPY_K_RN ZIMATCOPY_K_RN +#define IMATCOPY_K_CT ZIMATCOPY_K_CT +#define IMATCOPY_K_RT ZIMATCOPY_K_RT +#define IMATCOPY_K_CNC ZIMATCOPY_K_CNC +#define IMATCOPY_K_RNC ZIMATCOPY_K_RNC +#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC +#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC + #define GEADD_K ZGEADD_K #else @@ -2160,6 +2178,14 @@ #define OMATCOPY_K_RNC COMATCOPY_K_RNC #define OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define IMATCOPY_K_CN CIMATCOPY_K_CN +#define IMATCOPY_K_RN CIMATCOPY_K_RN +#define IMATCOPY_K_CT CIMATCOPY_K_CT +#define IMATCOPY_K_RT CIMATCOPY_K_RT +#define IMATCOPY_K_CNC CIMATCOPY_K_CNC +#define IMATCOPY_K_RNC CIMATCOPY_K_RNC +#define IMATCOPY_K_CTC CIMATCOPY_K_CTC +#define IMATCOPY_K_RTC CIMATCOPY_K_RTC #define GEADD_K CGEADD_K diff --git a/common_mips64.h b/common_mips64.h index 7cd86b375..f5c0ec7cf 100644 --- 
a/common_mips64.h +++ b/common_mips64.h @@ -98,6 +98,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned int rpcc(void){ unsigned long ret; @@ -118,6 +119,7 @@ static inline unsigned int rpcc(void){ #endif return ret; } +#define RPCC_DEFINED #if defined(LOONGSON3A) || defined(LOONGSON3B) #ifndef NO_AFFINITY diff --git a/common_param.h b/common_param.h index 1b56e85f0..ab40ddeef 100644 --- a/common_param.h +++ b/common_param.h @@ -830,31 +830,61 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); + int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); + int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); + int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); + int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int 
(*comatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rt) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*comatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); + int (*cimatcopy_k_cnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); + int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rt) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zomatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, 
double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*zimatcopy_k_cnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); diff --git a/common_power.h b/common_power.h index f88f527bd..ab331b04a 100644 --- a/common_power.h +++ b/common_power.h @@ -87,6 +87,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ #endif } while (ret); } +#define BLAS_LOCK_DEFINED static inline unsigned long rpcc(void){ unsigned long ret; @@ -103,6 +104,7 @@ static inline unsigned long rpcc(void){ #endif } +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT @@ -495,6 +497,15 @@ static inline int blas_quickdivide(blasint x, blasint y){ REALNAME: #define EPILOGUE .size REALNAME, .-REALNAME #else +#if _CALL_ELF == 2 +#define PROLOGUE \ + .section .text;\ + .align 6;\ + .globl REALNAME;\ + .type REALNAME, @function;\ +REALNAME: +#define EPILOGUE .size REALNAME, .-REALNAME +#else #define PROLOGUE \ .section .text;\ .align 5;\ @@ -514,6 +525,7 @@ REALNAME:;\ .size .REALNAME, .-.REALNAME; \ .section .note.GNU-stack,"",@progbits #endif +#endif #ifdef PROFILE #ifndef __64BIT__ @@ -792,4 +804,25 @@ Lmcount$lazy_ptr: #ifndef MAP_ANONYMOUS #define MAP_ANONYMOUS MAP_ANON #endif + +#ifdef OS_LINUX +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 8) +#else +#if _CALL_ELF == 2 
+#define FRAMESLOT(X) (((X) * 8) + 96) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif +#endif + +#if defined(OS_AIX) || defined(OS_DARWIN) +#ifndef __64BIT__ +#define FRAMESLOT(X) (((X) * 4) + 56) +#else +#define FRAMESLOT(X) (((X) * 8) + 112) +#endif +#endif + #endif diff --git a/common_s.h b/common_s.h index a4d8679b7..3c1600859 100644 --- a/common_s.h +++ b/common_s.h @@ -152,6 +152,10 @@ #define SOMATCOPY_K_RN somatcopy_k_rn #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SIMATCOPY_K_CN simatcopy_k_cn +#define SIMATCOPY_K_RN simatcopy_k_rn +#define SIMATCOPY_K_CT simatcopy_k_ct +#define SIMATCOPY_K_RT simatcopy_k_rt #define SGEADD_K sgeadd_k @@ -274,6 +278,10 @@ #define SOMATCOPY_K_RN gotoblas -> somatcopy_k_rn #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SIMATCOPY_K_CN gotoblas -> simatcopy_k_cn +#define SIMATCOPY_K_RN gotoblas -> simatcopy_k_rn +#define SIMATCOPY_K_CT gotoblas -> simatcopy_k_ct +#define SIMATCOPY_K_RT gotoblas -> simatcopy_k_rt #define SGEADD_K gotoblas -> sgeadd_k diff --git a/common_sparc.h b/common_sparc.h index 87ef75276..f99972db9 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -58,6 +58,7 @@ static void __inline blas_lock(volatile unsigned long *address){ : "memory"); } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long rpcc(void){ unsigned long clocks; @@ -66,6 +67,7 @@ static __inline unsigned long rpcc(void){ return clocks; }; +#define RPCC_DEFINED #ifdef __64BIT__ #define RPCC64BIT diff --git a/common_x86.h b/common_x86.h index 8f875fc29..1ace84cad 100644 --- a/common_x86.h +++ b/common_x86.h @@ -77,6 +77,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline unsigned long long rpcc(void){ #if defined(_MSC_VER) && !defined(__clang__) @@ -89,6 +90,7 @@ static __inline unsigned long long rpcc(void){ return ((unsigned long 
long)a + ((unsigned long long)d << 32)); #endif }; +#define RPCC_DEFINED static __inline unsigned long getstackaddr(void){ #if defined(_MSC_VER) && !defined(__clang__) diff --git a/common_x86_64.h b/common_x86_64.h index 54377695c..da9afc0e4 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -86,6 +86,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ } while (ret); } +#define BLAS_LOCK_DEFINED static __inline BLASULONG rpcc(void){ #ifdef C_MSVC @@ -98,6 +99,7 @@ static __inline BLASULONG rpcc(void){ return ((BLASULONG)a + ((BLASULONG)d << 32)); #endif } +#define RPCC_DEFINED #define RPCC64BIT diff --git a/common_z.h b/common_z.h index b17122776..b4f58bb0c 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,15 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZIMATCOPY_K_CN zimatcopy_k_cn +#define ZIMATCOPY_K_RN zimatcopy_k_rn +#define ZIMATCOPY_K_CT zimatcopy_k_ct +#define ZIMATCOPY_K_RT zimatcopy_k_rt +#define ZIMATCOPY_K_CNC zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC zimatcopy_k_rtc + #define ZGEADD_K zgeadd_k #else @@ -404,6 +413,15 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZIMATCOPY_K_CN gotoblas -> zimatcopy_k_cn +#define ZIMATCOPY_K_RN gotoblas -> zimatcopy_k_rn +#define ZIMATCOPY_K_CT gotoblas -> zimatcopy_k_ct +#define ZIMATCOPY_K_RT gotoblas -> zimatcopy_k_rt +#define ZIMATCOPY_K_CNC gotoblas -> zimatcopy_k_cnc +#define ZIMATCOPY_K_RNC gotoblas -> zimatcopy_k_rnc +#define ZIMATCOPY_K_CTC gotoblas -> zimatcopy_k_ctc +#define ZIMATCOPY_K_RTC gotoblas -> zimatcopy_k_rtc + #define ZGEADD_K gotoblas -> zgeadd_k #endif diff --git a/cpuid_arm.c b/cpuid_arm.c index 51ba72d70..6485003f3 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -192,6 +192,7 @@ void get_cpuconfig(void) { case CPU_CORTEXA9: printf("#define CORTEXA9\n"); + printf("#define 
ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); @@ -207,6 +208,7 @@ void get_cpuconfig(void) case CPU_CORTEXA15: printf("#define CORTEXA15\n"); + printf("#define ARMV7\n"); printf("#define HAVE_VFP\n"); printf("#define HAVE_VFPV3\n"); if ( get_feature("neon")) printf("#define HAVE_NEON\n"); diff --git a/cpuid_power.c b/cpuid_power.c index 2fc333dd2..366c6ed08 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -115,6 +115,7 @@ int detect(void){ if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; diff --git a/cpuid_x86.c b/cpuid_x86.c index d2138cc6b..a65991041 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1159,6 +1159,16 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1173,6 +1183,17 @@ int get_cpuname(void){ return CPUTYPE_HASWELL; #else return CPUTYPE_SANDYBRIDGE; +#endif + else + return CPUTYPE_NEHALEM; + case 5: + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_HASWELL; +#else + return CPUTYPE_SANDYBRIDGE; #endif else return CPUTYPE_NEHALEM; @@ -1634,6 +1655,16 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + case 14: + //Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; @@ -1648,6 +1679,17 @@ int get_coretype(void){ return CORE_HASWELL; #else return CORE_SANDYBRIDGE; +#endif + else + 
return CORE_NEHALEM; + case 5: + case 14: + // Skylake + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; #endif else return CORE_NEHALEM; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b3b1ce7bd..1fd848c6b 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -425,6 +425,10 @@ static int blas_thread_server(void *arg){ main_status[cpu] = MAIN_FINISH; #endif + // arm: make sure all results are written out _before_ + // thread is marked as done and other threads use them + WMB; + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ WMB; @@ -775,7 +779,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ stop = rpcc(); #endif - if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + if ((num > 1) && queue -> next) { + exec_blas_async_wait(num - 1, queue -> next); + + // arm: make sure results from other threads are visible + MB; + } #ifdef TIMING_DEBUG fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index ff80504f9..c41164559 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -252,6 +252,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + //Intel Skylake + if (model == 14) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; case 5: //Intel Broadwell @@ -263,6 +272,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } + //Intel Skylake + if (model == 14 || model == 5) { + if(support_avx()) + return &gotoblas_HASWELL; + else{ + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } return NULL; } case 0xf: diff --git a/driver/others/memory.c b/driver/others/memory.c index a000a3c3e..78aee5185 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -143,8 +143,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #endif #ifdef DYNAMIC_ARCH @@ -1158,6 +1158,9 @@ void blas_memory_free(void *free_area){ printf(" Position : %d\n", position); #endif + // arm: ensure all writes are finished before other thread takes this memory + WMB; + memory[position].used = 0; #ifdef DEBUG diff --git a/getarch.c b/getarch.c index 7ea7c5ddd..0a49fd1b3 100644 --- a/getarch.c +++ b/getarch.c @@ -120,6 +120,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_POWER4 */ /* #define FORCE_POWER5 */ /* #define FORCE_POWER6 */ +/* #define FORCE_POWER7 */ +/* #define FORCE_POWER8 */ /* #define FORCE_PPCG4 */ /* #define FORCE_PPC970 */ /* #define FORCE_PPC970MP */ @@ -550,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER5" #endif -#ifdef FORCE_POWER6 +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -752,7 +754,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA9" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA9 " \ +#define ARCHCONFIG "-DCORTEXA9 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ @@ -767,7 +769,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHITECTURE "ARM" #define SUBARCHITECTURE "CORTEXA15" #define SUBDIRNAME "arm" -#define ARCHCONFIG "-DCORTEXA15 " \ +#define ARCHCONFIG "-DCORTEXA15 -DARMV7 " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ @@ -800,8 +802,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DARMV5 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ - "-DHAVE_VFP" + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " #define LIBNAME "armv5" #define CORENAME "ARMV5" #else diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 89f0ec823..f4309a85c 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -50,6 +51,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef malloc #undef free +/* Enables the New IMATCOPY code with inplace operation if lda == ldb */ +#define NEW_IMATCOPY + #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { @@ -75,7 +79,6 @@ void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { - char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; @@ -117,6 +120,34 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } +#ifdef NEW_IMATCOPY + if ( *lda == *ldb ) { + if ( order == BlasColMajor ) + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda ); + } + } + else + { + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda ); + } + else + { + IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda ); + } + } + return; + } + +#endif if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 3f273cf13..b1e1d15dc 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -26,7 +26,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /*********************************************************** - * 2014/06/10 Saar + * 2014-06-10 Saar + * 2015-09-07 grisuthedragon ***********************************************************/ #include @@ -49,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define BlasTransConj 2 #define BlasConj 3 +#define NEW_IMATCOPY #ifndef CBLAS void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) @@ -124,6 +126,52 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, return; } +#ifdef NEW_IMATCOPY + if (*lda == *ldb) { + if ( order == BlasColMajor ) + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + else + { + + if ( trans == BlasNoTrans ) + { + IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasConj ) + { + IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTrans ) + { + IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + if ( trans == BlasTransConj ) + { + IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda ); + } + } + return; + } +#endif + if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; else diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 60b8fb57f..63e675b8d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -334,11 +334,15 @@ endif SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) 
dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ @@ -346,6 +350,10 @@ CBLASOBJS += \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ @@ -353,6 +361,10 @@ ZBLASOBJS += \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ + zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) @@ -3305,6 +3317,34 @@ endif $(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef DIMATCOPY_CN +DIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RN +DIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef DIMATCOPY_CT +DIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + 
+$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DIMATCOPY_RT +DIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef SOMATCOPY_CN SOMATCOPY_CN = ../arm/omatcopy_cn.c endif @@ -3333,6 +3373,34 @@ endif $(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ +ifndef SIMATCOPY_CN +SIMATCOPY_CN = ../generic/imatcopy_cn.c +endif + +$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RN +SIMATCOPY_RN = ../generic/imatcopy_rn.c +endif + +$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + +ifndef SIMATCOPY_CT +SIMATCOPY_CT = ../generic/imatcopy_ct.c +endif + +$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef SIMATCOPY_RT +SIMATCOPY_RT = ../generic/imatcopy_rt.c +endif + +$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@ + ifndef COMATCOPY_CN COMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3390,6 +3458,63 @@ endif $(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC) $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef CIMATCOPY_CN +CIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RN +CIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ 
$< -o $@ + +ifndef CIMATCOPY_CT +CIMATCOPY_CT = ../generic/zimatcopy_ct.c +endif + +$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_RT +CIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef CIMATCOPY_CNC +CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RNC +CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_CTC +CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef CIMATCOPY_RTC +CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + + ifndef ZOMATCOPY_CN ZOMATCOPY_CN = ../arm/zomatcopy_cn.c @@ -3447,6 +3572,62 @@ endif $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef ZIMATCOPY_CN +ZIMATCOPY_CN = ../generic/zimatcopy_cn.c +endif + +$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RN +ZIMATCOPY_RN = ../generic/zimatcopy_rn.c +endif + +$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CT +ZIMATCOPY_CT = 
../generic/zimatcopy_ct.c +endif + +$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_RT +ZIMATCOPY_RT = ../generic/zimatcopy_rt.c +endif + +$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@ + +ifndef ZIMATCOPY_CNC +ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c +endif + +$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RNC +ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c +endif + +$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_CTC +ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c +endif + +$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@ + +ifndef ZIMATCOPY_RTC +ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c +endif + +$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ + ifndef SGEADD_K SGEADD_K = ../generic/geadd.c diff --git a/kernel/generic/imatcopy_cn.c b/kernel/generic/imatcopy_cn.c new file mode 100644 index 000000000..e63bc976c --- /dev/null +++ b/kernel/generic/imatcopy_cn.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2015-09-07 grisuthedragon +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda) +{ + BLASLONG i,j; + FLOAT *aptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + if ( alpha == 1.0 ) return(0); + + aptr = a; + if ( alpha == 0.0 ) + { + for ( i=0; i